From 3c57dc433ea511a2fd62948887b7a052e34fd54e Mon Sep 17 00:00:00 2001 From: Mikhail Beck Date: Thu, 30 Nov 2023 11:49:07 +0000 Subject: [PATCH] Updated sklearn notebooks (#86) Co-authored-by: Torsten Kilias --- .../files/notebook/data/data_abalone.ipynb | 6 ++-- .../files/notebook/data/data_telescope.ipynb | 15 ++------ .../sklearn/sklearn_predict_abalone.ipynb | 9 +++-- .../sklearn/sklearn_predict_telescope.ipynb | 9 +++-- .../sklearn/sklearn_predict_udf.ipynb | 5 ++- .../sklearn/sklearn_train_abalone.ipynb | 17 +++------ .../sklearn/sklearn_train_telescope.ipynb | 35 +++---------------- 7 files changed, 25 insertions(+), 71 deletions(-) diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb index 560ecc44..4d9805f1 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_abalone.ipynb @@ -106,7 +106,7 @@ "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection\n", "\n", "# Split the data into train and test sets\n", "df_train, df_test = train_test_split(df, test_size=0.2)\n", @@ -114,13 +114,11 @@ "train_table = 'ABALONE_TRAIN'\n", "test_table = 'ABALONE_TEST'\n", "column_desc = [' '.join(c) for c in column_def]\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", "\n", "stopwatch = Stopwatch()\n", "\n", "# Create an Exasol connection\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", "\n", " # Create tables\n", " sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n", diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb index d39a2f1a..e8487060 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/data/data_telescope.ipynb @@ -110,7 +110,7 @@ "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection\n", "\n", "# Split the data into train and test sets\n", "df_train, df_test = train_test_split(df, test_size=0.2)\n", @@ -122,8 +122,7 @@ "stopwatch = Stopwatch()\n", "\n", "# Create an Exasol connection\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", "\n", " # Create tables\n", " sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n", @@ -143,15 +142,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3490c957-366e-425f-91ae-a645ccabbfe0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e53cbb99-8387-4165-a930-e4f24abfeaee", + "id": "6645e76c-6a6e-48f3-a668-c1fd8717d7f2", "metadata": {}, "outputs": [], "source": [] diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb index 33d38d53..8b34e8fb 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_abalone.ipynb @@ -50,17 +50,16 @@ "metadata": {}, "outputs": [], "source": [ - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n", "from stopwatch import Stopwatch\n", "\n", "target_column = 'RINGS'\n", - "bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/abalone_svm_model.pkl'\n", + "bfs_model_path = get_udf_bucket_path(sb_config) + '/abalone_svm_model.pkl'\n", "params = {'schema': sb_config.SCHEMA, 'test_table': 'ABALONE_TEST', 'model_path': bfs_model_path}\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", "\n", "stopwatch = Stopwatch()\n", "\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " # Get the list of feature columns\n", " sql = 'SELECT * FROM {schema!q}.{test_table!q} LIMIT 1'\n", " df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n", @@ -98,7 +97,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Get the ground truth labels for the test set.\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n", " df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n", "\n", diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb index ed20774a..ffb66121 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_telescope.ipynb @@ -50,17 +50,16 @@ "metadata": {}, "outputs": [], "source": [ - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n", "from stopwatch import Stopwatch\n", "\n", "target_column = 'CLASS'\n", - "bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/telescope_tree_model.pkl'\n", + "bfs_model_path = get_udf_bucket_path(sb_config) + '/telescope_tree_model.pkl'\n", "params = {'schema': sb_config.SCHEMA, 'test_table': 'TELESCOPE_TEST', 'model_path': bfs_model_path}\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", "\n", "stopwatch = Stopwatch()\n", "\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " # Get the list of feature columns\n", " sql = 'SELECT * FROM {schema!i}.{test_table!i} LIMIT 1'\n", " df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n", @@ -98,7 +97,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Get the ground truth labels for the test set.\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n", " df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n", "\n", diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb index 7f70ca6e..583993dd 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_predict_udf.ipynb @@ -48,7 +48,7 @@ "outputs": [], "source": [ "import textwrap\n", - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection\n", "from stopwatch import Stopwatch\n", "\n", "stopwatch = Stopwatch()\n", @@ -104,8 +104,7 @@ "/\n", "\"\"\")\n", "\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " conn.execute(query=sql, query_params={'schema': sb_config.SCHEMA})\n", "\n", "print(f\"Creating prediction script took: {stopwatch}\")" diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb index f1c4709d..aa017d2d 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_abalone.ipynb @@ -52,14 +52,13 @@ "metadata": {}, "outputs": [], "source": [ - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection\n", "from sklearn.model_selection import train_test_split\n", "from stopwatch import Stopwatch\n", "\n", "stopwatch = Stopwatch()\n", "\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'ABALONE_TRAIN'))\n", "\n", "X, y = df.drop(columns='RINGS'), df['RINGS']\n", @@ -199,20 +198,14 @@ "outputs": [], "source": [ "import pickle\n", - "from exasol.bucketfs import Service\n", + "from exasol.connections import open_bucketfs_connection\n", "\n", "MODEL_FILE = 'abalone_svm_model.pkl'\n", "\n", - "# Setup the connection parameters.\n", - "buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n", - "buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n", - "buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n", - "\n", "stopwatch = Stopwatch()\n", "\n", - "# Connect to the BucketFS service and navigate to the bucket of choice.\n", - "bucketfs = Service(buckfs_url, buckfs_credentials)\n", - "bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n", + "# Connect to the BucketFS service\n", + "bucket = open_bucketfs_connection(sb_config)\n", "\n", "# Serialize the model into a byte-array and upload it to the BucketFS, \n", "# where it will be saved in the file with the specified name.\n", diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb index 897d37b2..cf9949fc 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb @@ -52,13 +52,12 @@ "metadata": {}, "outputs": [], "source": [ - "import pyexasol\n", + "from exasol.connections import open_pyexasol_connection\n", "from stopwatch import Stopwatch\n", "\n", "stopwatch = Stopwatch()\n", "\n", - "dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n", - "with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n", + "with open_pyexasol_connection(sb_config, compression=True) as conn:\n", " df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'TELESCOPE_TRAIN'))\n", "\n", "print(f\"Loading the data took: {stopwatch}\")" @@ -145,20 +144,14 @@ "outputs": [], "source": [ "import pickle\n", - "from exasol.bucketfs import Service\n", + "from exasol.connections import open_bucketfs_connection\n", "\n", "MODEL_FILE = 'telescope_tree_model.pkl'\n", "\n", - "# Setup the connection parameters.\n", - "buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n", - "buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n", - "buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n", - "\n", "stopwatch = Stopwatch()\n", "\n", - "# Connect to the BucketFS service and navigate to the bucket of choice.\n", - "bucketfs = Service(buckfs_url, buckfs_credentials)\n", - "bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n", + "# Connect to the BucketFS service\n", + "bucket = open_bucketfs_connection(sb_config)\n", "\n", "# Serialize the model into a byte-array and upload it to the BucketFS, \n", "# where it will be saved in the file with the specified name.\n", @@ -174,24 +167,6 @@ "source": [ "Now we are ready to use this model in our SQL queries. This will be demonstrated in the [following notebook](sklearn_predict_telescope.ipynb)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0301211d-8520-4f66-8727-114f3292bcd6", - "metadata": {}, - "outputs": [], - "source": [ - "buckfs_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec99ed24-d6ce-46bf-97f3-9d0b1c38aade", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {