Skip to content

Commit

Permalink
Updated sklearn notebooks (#86)
Browse files Browse the repository at this point in the history
Co-authored-by: Torsten Kilias <[email protected]>
  • Loading branch information
ahsimb and tkilias authored Nov 30, 2023
1 parent a33b19b commit 3c57dc4
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,19 @@
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"\n",
"# Split the data into train and test sets\n",
"df_train, df_test = train_test_split(df, test_size=0.2)\n",
"\n",
"train_table = 'ABALONE_TRAIN'\n",
"test_table = 'ABALONE_TEST'\n",
"column_desc = [' '.join(c) for c in column_def]\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Create an Exasol connection\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"\n",
"# Split the data into train and test sets\n",
"df_train, df_test = train_test_split(df, test_size=0.2)\n",
Expand All @@ -122,8 +122,7 @@
"stopwatch = Stopwatch()\n",
"\n",
"# Create an Exasol connection\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
Expand All @@ -143,15 +142,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3490c957-366e-425f-91ae-a645ccabbfe0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e53cbb99-8387-4165-a930-e4f24abfeaee",
"id": "6645e76c-6a6e-48f3-a668-c1fd8717d7f2",
"metadata": {},
"outputs": [],
"source": []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,16 @@
"metadata": {},
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
"from stopwatch import Stopwatch\n",
"\n",
"target_column = 'RINGS'\n",
"bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/abalone_svm_model.pkl'\n",
"bfs_model_path = get_udf_bucket_path(sb_config) + '/abalone_svm_model.pkl'\n",
"params = {'schema': sb_config.SCHEMA, 'test_table': 'ABALONE_TEST', 'model_path': bfs_model_path}\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" # Get the list of feature columns\n",
" sql = 'SELECT * FROM {schema!q}.{test_table!q} LIMIT 1'\n",
" df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
Expand Down Expand Up @@ -98,7 +97,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Get the ground truth labels for the test set.\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
" df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,16 @@
"metadata": {},
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
"from stopwatch import Stopwatch\n",
"\n",
"target_column = 'CLASS'\n",
"bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/telescope_tree_model.pkl'\n",
"bfs_model_path = get_udf_bucket_path(sb_config) + '/telescope_tree_model.pkl'\n",
"params = {'schema': sb_config.SCHEMA, 'test_table': 'TELESCOPE_TEST', 'model_path': bfs_model_path}\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" # Get the list of feature columns\n",
" sql = 'SELECT * FROM {schema!i}.{test_table!i} LIMIT 1'\n",
" df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
Expand Down Expand Up @@ -98,7 +97,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Get the ground truth labels for the test set.\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
" df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"outputs": [],
"source": [
"import textwrap\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
Expand Down Expand Up @@ -104,8 +104,7 @@
"/\n",
"\"\"\")\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" conn.execute(query=sql, query_params={'schema': sb_config.SCHEMA})\n",
"\n",
"print(f\"Creating prediction script took: {stopwatch}\")"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,13 @@
"metadata": {},
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from sklearn.model_selection import train_test_split\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'ABALONE_TRAIN'))\n",
"\n",
"X, y = df.drop(columns='RINGS'), df['RINGS']\n",
Expand Down Expand Up @@ -199,20 +198,14 @@
"outputs": [],
"source": [
"import pickle\n",
"from exasol.bucketfs import Service\n",
"from exasol.connections import open_bucketfs_connection\n",
"\n",
"MODEL_FILE = 'abalone_svm_model.pkl'\n",
"\n",
"# Setup the connection parameters.\n",
"buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
"buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
"buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Connect to the BucketFS service and navigate to the bucket of choice.\n",
"bucketfs = Service(buckfs_url, buckfs_credentials)\n",
"bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
"# Connect to the BucketFS service\n",
"bucket = open_bucketfs_connection(sb_config)\n",
"\n",
"# Serialize the model into a byte-array and upload it to the BucketFS, \n",
"# where it will be saved in the file with the specified name.\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,12 @@
"metadata": {},
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'TELESCOPE_TRAIN'))\n",
"\n",
"print(f\"Loading the data took: {stopwatch}\")"
Expand Down Expand Up @@ -145,20 +144,14 @@
"outputs": [],
"source": [
"import pickle\n",
"from exasol.bucketfs import Service\n",
"from exasol.connections import open_bucketfs_connection\n",
"\n",
"MODEL_FILE = 'telescope_tree_model.pkl'\n",
"\n",
"# Setup the connection parameters.\n",
"buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
"buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
"buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Connect to the BucketFS service and navigate to the bucket of choice.\n",
"bucketfs = Service(buckfs_url, buckfs_credentials)\n",
"bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
"# Connect to the BucketFS service\n",
"bucket = open_bucketfs_connection(sb_config)\n",
"\n",
"# Serialize the model into a byte-array and upload it to the BucketFS, \n",
"# where it will be saved in the file with the specified name.\n",
Expand All @@ -174,24 +167,6 @@
"source": [
"Now we are ready to use this model in our SQL queries. This will be demonstrated in the [following notebook](sklearn_predict_telescope.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0301211d-8520-4f66-8727-114f3292bcd6",
"metadata": {},
"outputs": [],
"source": [
"buckfs_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec99ed24-d6ce-46bf-97f3-9d0b1c38aade",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 3c57dc4

Please sign in to comment.