-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🧐 Last exploration of the pklm test before implementation in Qolmat
- Loading branch information
1 parent
c87d08d
commit e13f359
Showing
15 changed files
with
724,506 additions
and
229 deletions.
There are no files selected for viewing
94 changes: 94 additions & 0 deletions
94
examples/pklm/Datasets_creation_for_comparison/Dataset_creation.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "9f9c3954", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "8a57d1c9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def create_df_with_nan(n_rows: int, n_cols: int) -> pd.DataFrame:\n", | ||
"    \"\"\"Build a standard-normal DataFrame with 35% of its cells set to NaN.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        n_rows: Number of rows.\n", | ||
"        n_cols: Number of columns, named Colonne_0 ... Colonne_{n_cols - 1}.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A float DataFrame of shape (n_rows, n_cols) in which\n", | ||
"        int(0.35 * n_rows * n_cols) cells, drawn uniformly without\n", | ||
"        replacement, are NaN.\n", | ||
"    \"\"\"\n", | ||
"    # Drawing as (n_cols, n_rows) and transposing makes each column one\n", | ||
"    # contiguous run of draws from the global NumPy random stream.\n", | ||
"    values = np.ascontiguousarray(np.random.normal(size=(n_cols, n_rows)).T)\n", | ||
"    nb_valeurs_manquantes = int(0.35 * values.size)\n", | ||
"    indices_valeurs_manquantes = np.random.choice(\n", | ||
"        values.size, nb_valeurs_manquantes, replace=False\n", | ||
"    )\n", | ||
"    # Punch the holes in an array we own *before* building the frame:\n", | ||
"    # DataFrame.values is not guaranteed to be a writable view of the\n", | ||
"    # frame's data, so assigning through it can be silently lost or raise.\n", | ||
"    values.flat[indices_valeurs_manquantes] = np.nan\n", | ||
"    return pd.DataFrame(values, columns=[f\"Colonne_{i}\" for i in range(n_cols)])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "1aa325a7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"create_df_with_nan(n_rows=500, n_cols=10).to_csv(path_or_buf=\"df_for_bug.csv\", index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"id": "2fb7eb43", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# One entry per benchmark dataset: df1 ... df9, keyed to its (n_rows, n_cols) shape.\n", | ||
"config = {\n", | ||
"    f\"df{k}\": {\"n_rows\": n_rows, \"n_cols\": n_cols}\n", | ||
"    for k, (n_rows, n_cols) in enumerate(\n", | ||
"        [\n", | ||
"            (200, 2),\n", | ||
"            (500, 2),\n", | ||
"            (500, 4),\n", | ||
"            (1000, 4),\n", | ||
"            (1000, 6),\n", | ||
"            (10000, 6),\n", | ||
"            (10000, 10),\n", | ||
"            (100000, 10),\n", | ||
"            (100000, 15),\n", | ||
"        ],\n", | ||
"        start=1,\n", | ||
"    )\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"id": "e542d35c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Generate every configured dataset and write it next to the notebook as <name>.csv.\n", | ||
"for df_name in config:\n", | ||
"    params = config[df_name]\n", | ||
"    df = create_df_with_nan(**params)\n", | ||
"    df.to_csv(path_or_buf=f\"{df_name}.csv\", index=False)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.19" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "d1d98837", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"--- 9.613310098648071 seconds ---\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import time\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"\n", | ||
"\n", | ||
"# Benchmark: fit 100 random forests one after another on random column subsets.\n", | ||
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n", | ||
"df = pd.DataFrame(data)\n", | ||
"\n", | ||
"_, n_col = df.shape\n", | ||
"\n", | ||
"result = 0\n", | ||
"\n", | ||
"# perf_counter() is monotonic and high-resolution; time.time() follows the\n", | ||
"# wall clock and can jump if the system clock is adjusted mid-run.\n", | ||
"start_time = time.perf_counter()\n", | ||
"\n", | ||
"for _ in range(100):\n", | ||
"    # randint's upper bound is exclusive: between 2 and n_col - 1 columns.\n", | ||
"    df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n", | ||
"    # The first sampled column becomes the binary missingness target.\n", | ||
"    df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n", | ||
"    # NOTE(review): df holds no NaN here, so this target is all zeros and\n", | ||
"    # every forest is fitted on a single class -- confirm this is the\n", | ||
"    # workload the timing is meant to represent.\n", | ||
"    df_classif['target'] = 1 * df_classif['target'].isnull()\n", | ||
"    X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n", | ||
"    clf = RandomForestClassifier(\n", | ||
"        n_estimators=200,\n", | ||
"        max_features=None,\n", | ||
"        min_samples_split=10,\n", | ||
"        bootstrap=True,\n", | ||
"        oob_score=True,\n", | ||
"        random_state=42\n", | ||
"    )\n", | ||
"    clf.fit(X, y)\n", | ||
"\n", | ||
"    # Accumulate the accuracy of each forest on its own training data.\n", | ||
"    result += clf.score(X, y)\n", | ||
"\n", | ||
"print(\"--- %s seconds ---\" % (time.perf_counter() - start_time))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "91583a21", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"--- 26.025609016418457 seconds ---\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import time\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"\n", | ||
"\n", | ||
"# Benchmark: same 100 sequential fits, but each forest is built with n_jobs=-1.\n", | ||
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n", | ||
"df = pd.DataFrame(data)\n", | ||
"\n", | ||
"_, n_col = df.shape\n", | ||
"\n", | ||
"result = 0\n", | ||
"\n", | ||
"# perf_counter() is monotonic and high-resolution; time.time() follows the\n", | ||
"# wall clock and can jump if the system clock is adjusted mid-run.\n", | ||
"start_time = time.perf_counter()\n", | ||
"\n", | ||
"for _ in range(100):\n", | ||
"    # randint's upper bound is exclusive: between 2 and n_col - 1 columns.\n", | ||
"    df_classif = df.sample(np.random.randint(2, n_col), axis=1)\n", | ||
"    # The first sampled column becomes the binary missingness target.\n", | ||
"    df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n", | ||
"    # NOTE(review): df holds no NaN here, so this target is all zeros and\n", | ||
"    # every forest is fitted on a single class -- confirm this is the\n", | ||
"    # workload the timing is meant to represent.\n", | ||
"    df_classif['target'] = 1 * df_classif['target'].isnull()\n", | ||
"    X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n", | ||
"    clf = RandomForestClassifier(\n", | ||
"        n_estimators=200,\n", | ||
"        max_features=None,\n", | ||
"        min_samples_split=10,\n", | ||
"        bootstrap=True,\n", | ||
"        oob_score=True,\n", | ||
"        random_state=42,\n", | ||
"        # Parallelism is applied inside each forest, not across iterations.\n", | ||
"        n_jobs=-1\n", | ||
"    )\n", | ||
"    clf.fit(X, y)\n", | ||
"\n", | ||
"    # Accumulate the accuracy of each forest on its own training data.\n", | ||
"    result += clf.score(X, y)\n", | ||
"\n", | ||
"print(\"--- %s seconds ---\" % (time.perf_counter() - start_time))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"id": "c651c52b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"--- 1.8062052726745605 seconds ---\n", | ||
"Result: 100.0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import time\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"from joblib import Parallel, delayed\n", | ||
"\n", | ||
"# Generate the data\n", | ||
"data = {f\"Colonne_{i}\": np.random.normal(size=200).astype(float) for i in range(6)}\n", | ||
"df = pd.DataFrame(data)\n", | ||
"_, n_col = df.shape\n", | ||
"\n", | ||
"# Function running a single iteration of the process\n", | ||
"def process_iteration(i):\n", | ||
"    \"\"\"Fit one random forest on a random column subset and return its training accuracy.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        i: Iteration index. It seeds this iteration's column sampling so the\n", | ||
"            draw does not depend on which worker process runs it.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        Mean accuracy of the fitted forest on its own training data.\n", | ||
"    \"\"\"\n", | ||
"    # A per-iteration generator replaces the process-global NumPy state,\n", | ||
"    # which is not shared with (nor controlled from) joblib workers.\n", | ||
"    rng = np.random.RandomState(i)\n", | ||
"    # randint's upper bound is exclusive: between 2 and n_col - 1 columns.\n", | ||
"    df_classif = df.sample(rng.randint(2, n_col), axis=1, random_state=rng)\n", | ||
"    # The first sampled column becomes the binary missingness target.\n", | ||
"    df_classif = df_classif.rename(columns={df_classif.columns[0]: 'target'})\n", | ||
"    # NOTE(review): df holds no NaN here, so this target is all zeros and\n", | ||
"    # every forest is fitted on a single class -- confirm this is the\n", | ||
"    # workload the timing is meant to represent.\n", | ||
"    df_classif['target'] = 1 * df_classif['target'].isnull()\n", | ||
"    X, y = df_classif.loc[:, df_classif.columns != \"target\"], df_classif[\"target\"]\n", | ||
"    clf = RandomForestClassifier(\n", | ||
"        n_estimators=200,\n", | ||
"        max_features=None,\n", | ||
"        min_samples_split=10,\n", | ||
"        bootstrap=True,\n", | ||
"        oob_score=True,\n", | ||
"        random_state=42\n", | ||
"    )\n", | ||
"    clf.fit(X, y)\n", | ||
"    return clf.score(X, y)\n", | ||
"\n", | ||
"# perf_counter() is monotonic and high-resolution; time.time() follows the\n", | ||
"# wall clock and can jump if the system clock is adjusted mid-run.\n", | ||
"start_time = time.perf_counter()\n", | ||
"\n", | ||
"# Use joblib to run the iterations in parallel\n", | ||
"results = Parallel(n_jobs=-1)(delayed(process_iteration)(i) for i in range(100))\n", | ||
"\n", | ||
"# Compute the final result\n", | ||
"result = sum(results)\n", | ||
"print(\"--- %s seconds ---\" % (time.perf_counter() - start_time))\n", | ||
"print(f\"Result: {result}\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.19" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.