From 3c57dc433ea511a2fd62948887b7a052e34fd54e Mon Sep 17 00:00:00 2001
From: Mikhail Beck <mikhail.beck@exasol.com>
Date: Thu, 30 Nov 2023 11:49:07 +0000
Subject: [PATCH] Updated sklearn notebooks (#86)

Co-authored-by: Torsten Kilias <tkilias@users.noreply.github.com>
---
 .../files/notebook/data/data_abalone.ipynb    |  6 ++--
 .../files/notebook/data/data_telescope.ipynb  | 15 ++------
 .../sklearn/sklearn_predict_abalone.ipynb     |  9 +++--
 .../sklearn/sklearn_predict_telescope.ipynb   |  9 +++--
 .../sklearn/sklearn_predict_udf.ipynb         |  5 ++-
 .../sklearn/sklearn_train_abalone.ipynb       | 17 +++------
 .../sklearn/sklearn_train_telescope.ipynb     | 35 +++----------------
 7 files changed, 25 insertions(+), 71 deletions(-)

diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb
index 560ecc44..4d9805f1 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb
@@ -106,7 +106,7 @@
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection\n",
     "\n",
     "# Split the data into train and test sets\n",
     "df_train, df_test = train_test_split(df, test_size=0.2)\n",
@@ -114,13 +114,11 @@
     "train_table = 'ABALONE_TRAIN'\n",
     "test_table = 'ABALONE_TEST'\n",
     "column_desc = [' '.join(c) for c in column_def]\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
     "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
     "# Create an Exasol connection\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "\n",
     "    # Create tables\n",
     "    sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb
index d39a2f1a..e8487060 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb
@@ -110,7 +110,7 @@
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection\n",
     "\n",
     "# Split the data into train and test sets\n",
     "df_train, df_test = train_test_split(df, test_size=0.2)\n",
@@ -122,8 +122,7 @@
     "stopwatch = Stopwatch()\n",
     "\n",
     "# Create an Exasol connection\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "\n",
     "    # Create tables\n",
     "    sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
@@ -143,15 +142,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3490c957-366e-425f-91ae-a645ccabbfe0",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e53cbb99-8387-4165-a930-e4f24abfeaee",
+   "id": "6645e76c-6a6e-48f3-a668-c1fd8717d7f2",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb
index 33d38d53..8b34e8fb 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb
@@ -50,17 +50,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
     "from stopwatch import Stopwatch\n",
     "\n",
     "target_column = 'RINGS'\n",
-    "bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/abalone_svm_model.pkl'\n",
+    "bfs_model_path = get_udf_bucket_path(sb_config) + '/abalone_svm_model.pkl'\n",
     "params = {'schema': sb_config.SCHEMA, 'test_table': 'ABALONE_TEST', 'model_path': bfs_model_path}\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
     "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    # Get the list of feature columns\n",
     "    sql = 'SELECT * FROM {schema!q}.{test_table!q} LIMIT 1'\n",
     "    df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
@@ -98,7 +97,7 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "# Get the ground truth labels for the test set.\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
     "    df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
     "\n",
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb
index ed20774a..ffb66121 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb
@@ -50,17 +50,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
     "from stopwatch import Stopwatch\n",
     "\n",
     "target_column = 'CLASS'\n",
-    "bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/telescope_tree_model.pkl'\n",
+    "bfs_model_path = get_udf_bucket_path(sb_config) + '/telescope_tree_model.pkl'\n",
     "params = {'schema': sb_config.SCHEMA, 'test_table': 'TELESCOPE_TEST', 'model_path': bfs_model_path}\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
     "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    # Get the list of feature columns\n",
     "    sql = 'SELECT * FROM {schema!i}.{test_table!i} LIMIT 1'\n",
     "    df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
@@ -98,7 +97,7 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "# Get the ground truth labels for the test set.\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
     "    df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
     "\n",
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb
index 7f70ca6e..583993dd 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb
@@ -48,7 +48,7 @@
    "outputs": [],
    "source": [
     "import textwrap\n",
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection\n",
     "from stopwatch import Stopwatch\n",
     "\n",
     "stopwatch = Stopwatch()\n",
@@ -104,8 +104,7 @@
     "/\n",
     "\"\"\")\n",
     "\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    conn.execute(query=sql, query_params={'schema': sb_config.SCHEMA})\n",
     "\n",
     "print(f\"Creating prediction script took: {stopwatch}\")"
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb
index f1c4709d..aa017d2d 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb
@@ -52,14 +52,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection\n",
     "from sklearn.model_selection import train_test_split\n",
     "from stopwatch import Stopwatch\n",
     "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'ABALONE_TRAIN'))\n",
     "\n",
     "X, y = df.drop(columns='RINGS'), df['RINGS']\n",
@@ -199,20 +198,14 @@
    "outputs": [],
    "source": [
     "import pickle\n",
-    "from exasol.bucketfs import Service\n",
+    "from exasol.connections import open_bucketfs_connection\n",
     "\n",
     "MODEL_FILE = 'abalone_svm_model.pkl'\n",
     "\n",
-    "# Setup the connection parameters.\n",
-    "buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
-    "buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
-    "buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
-    "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "# Connect to the BucketFS service and navigate to the bucket of choice.\n",
-    "bucketfs = Service(buckfs_url, buckfs_credentials)\n",
-    "bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
+    "# Connect to the BucketFS service\n",
+    "bucket = open_bucketfs_connection(sb_config)\n",
     "\n",
     "# Serialize the model into a byte-array and upload it to the BucketFS, \n",
     "# where it will be saved in the file with the specified name.\n",
diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb
index 897d37b2..cf9949fc 100644
--- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb
+++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb
@@ -52,13 +52,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pyexasol\n",
+    "from exasol.connections import open_pyexasol_connection\n",
     "from stopwatch import Stopwatch\n",
     "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
-    "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
+    "with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
     "    df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'TELESCOPE_TRAIN'))\n",
     "\n",
     "print(f\"Loading the data took: {stopwatch}\")"
@@ -145,20 +144,14 @@
    "outputs": [],
    "source": [
     "import pickle\n",
-    "from exasol.bucketfs import Service\n",
+    "from exasol.connections import open_bucketfs_connection\n",
     "\n",
     "MODEL_FILE = 'telescope_tree_model.pkl'\n",
     "\n",
-    "# Setup the connection parameters.\n",
-    "buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
-    "buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
-    "buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
-    "\n",
     "stopwatch = Stopwatch()\n",
     "\n",
-    "# Connect to the BucketFS service and navigate to the bucket of choice.\n",
-    "bucketfs = Service(buckfs_url, buckfs_credentials)\n",
-    "bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
+    "# Connect to the BucketFS service\n",
+    "bucket = open_bucketfs_connection(sb_config)\n",
     "\n",
     "# Serialize the model into a byte-array and upload it to the BucketFS, \n",
     "# where it will be saved in the file with the specified name.\n",
@@ -174,24 +167,6 @@
    "source": [
     "Now we are ready to use this model in our SQL queries. This will be demonstrated in the [following notebook](sklearn_predict_telescope.ipynb)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0301211d-8520-4f66-8727-114f3292bcd6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "buckfs_url"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ec99ed24-d6ce-46bf-97f3-9d0b1c38aade",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {