-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
1,827 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "30001487-61d5-46be-a662-83f406d8cc2a", | ||
"metadata": {}, | ||
"source": [ | ||
"Here we will load data of physical measurements of abalones (sea snails). <a href=\"https://archive.ics.uci.edu/dataset/1/abalone\" target=\"_blank\" rel=\"noopener\">Follow this link</a> to get details about this dataset.\n", | ||
"\n", | ||
"To execute queries and upload data to Exasol database we will be using the <a href=\"https://github.com/exasol/pyexasol\" target=\"_blank\" rel=\"noopener\">`pyexasol`</a> module.\n", | ||
"\n", | ||
"Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7d1fd009-42ca-4b16-929a-d00d284e2e1f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", | ||
"EXASOL_EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", | ||
"EXASOL_HOST_PORT = \"8888\"\n", | ||
"EXASOL_EXTERNAL_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_HOST_PORT}\"\"\"\n", | ||
"EXASOL_USER = \"sys\"\n", | ||
"EXASOL_PASSWORD = \"exasol\"\n", | ||
"EXASOL_BUCKETFS_PORT = \"6666\"\n", | ||
"EXASOL_EXTERNAL_BUCKETFS_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_BUCKETFS_PORT}\"\"\"\n", | ||
"EXASOL_BUCKETFS_USER = \"w\"\n", | ||
"EXASOL_BUCKETFS_PASSWORD = \"write\"\n", | ||
"EXASOL_BUCKETFS_USE_HTTPS = False\n", | ||
"EXASOL_BUCKETFS_URL_PREFIX = \"https://\" if EXASOL_BUCKETFS_USE_HTTPS else \"http://\"\n", | ||
"EXASOL_BUCKETFS_SERVICE = \"bfsdefault\"\n", | ||
"EXASOL_BUCKETFS_BUCKET = \"default\"\n", | ||
"EXASOL_BUCKETFS_PATH = f\"/buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}\" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", | ||
"EXASOL_SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", | ||
"EXASOL_UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", | ||
"EXASOL_UDF_RELEASE= \"20190116\"\n", | ||
"EXASOL_UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", | ||
"EXASOL_SCRIPT_LANGUAGES = f\"{EXASOL_SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}?lang=python#buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}/exaudf/{EXASOL_UDF_CLIENT}\";\n", | ||
"EXASOL_SCHEMA = \"IDA\"\n", | ||
"\n", | ||
"connection_params = {\"dns\": EXASOL_EXTERNAL_HOST, \"user\": EXASOL_USER, \"password\": EXASOL_PASSWORD, \"compression\": True}\n", | ||
"\n", | ||
"params = {\n", | ||
" \"script_languages\": EXASOL_SCRIPT_LANGUAGES,\n", | ||
" \"script_language_name\": EXASOL_SCRIPT_LANGUAGE_NAME,\n", | ||
" \"schema\": EXASOL_SCHEMA,\n", | ||
" \"EXASOL_BUCKETFS_PORT\": EXASOL_BUCKETFS_PORT,\n", | ||
" \"EXASOL_BUCKETFS_USER\": EXASOL_BUCKETFS_USER,\n", | ||
" \"EXASOL_BUCKETFS_PASSWORD\": EXASOL_BUCKETFS_PASSWORD,\n", | ||
" \"EXASOL_BUCKETFS_USE_HTTPS\": EXASOL_BUCKETFS_USE_HTTPS,\n", | ||
" \"EXASOL_BUCKETFS_BUCKET\": EXASOL_BUCKETFS_BUCKET,\n", | ||
" \"EXASOL_BUCKETFS_PATH\": EXASOL_BUCKETFS_PATH\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "2b97740f-c9b6-40f8-a9c7-3ddcd08e0898", | ||
"metadata": {}, | ||
"source": [ | ||
"First we will load the data into pandas DataFrame.\n", | ||
"We will name the column as per their description (see Variable Table in the dataset description)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "863b0429-1fe3-46ae-9569-a08c11c78464", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Downloading the data took: 922.25ms\n", | ||
" Sex Length Diameter Height Whole_weight Shucked_weight Viscera_weight \\\n", | ||
"0 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 \n", | ||
"1 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 \n", | ||
"2 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 \n", | ||
"3 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 \n", | ||
"4 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 \n", | ||
"\n", | ||
" Shell_weight Rings \n", | ||
"0 0.070 7 \n", | ||
"1 0.210 9 \n", | ||
"2 0.155 10 \n", | ||
"3 0.055 7 \n", | ||
"4 0.120 8 \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from urllib.request import urlopen\n", | ||
"import tempfile\n", | ||
"from zipfile import ZipFile\n", | ||
"import pandas as pd\n", | ||
"from stopwatch import Stopwatch\n", | ||
"\n", | ||
"stopwatch = Stopwatch()\n", | ||
"\n", | ||
"DATA_URL = \"https://archive.ics.uci.edu/static/public/1/abalone.zip\"\n", | ||
"DATA_FILE = \"abalone.data\"\n", | ||
"\n", | ||
"resp = urlopen(DATA_URL)\n", | ||
"with tempfile.TemporaryFile() as f:\n", | ||
" f.write(resp.read())\n", | ||
" print(f\"Downloading the data took: {stopwatch}\")\n", | ||
"\n", | ||
" f.seek(0)\n", | ||
" with ZipFile(f) as z:\n", | ||
" with z.open(DATA_FILE, \"r\") as f:\n", | ||
" df = pd.read_csv(f)\n", | ||
"\n", | ||
"column_def = [\n", | ||
" ('Sex', 'CHAR(1)'),\t # M, F, and I (infant)\n", | ||
" ('Length', 'DECIMAL(4,3)'), # longest shell measurement (mm)\n", | ||
" ('Diameter', 'DECIMAL(4,3)'),\t # perpendicular to length (mm)\n", | ||
" ('Height', 'DECIMAL(4,3)'), # with meat in shell (mm)\n", | ||
" ('Whole_weight', 'DECIMAL(5,4)'), # whole abalone (grams)\n", | ||
" ('Shucked_weight', 'DECIMAL(5,4)'), # weight of meat (grams)\n", | ||
" ('Viscera_weight', 'DECIMAL(5,4)'), # gut weight (after bleeding) (grams)\n", | ||
" ('Shell_weight', 'DECIMAL(4,3)'), # after being dried (grams)\n", | ||
" ('Rings', 'INT') # +1.5 gives the age in years\n", | ||
"]\n", | ||
"df.columns = [c[0] for c in column_def]\n", | ||
"\n", | ||
"print(df.head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "916ba87d-57f4-44f6-a0f8-8c5556afb719", | ||
"metadata": {}, | ||
"source": [ | ||
"Let's split data randomly into train and test sets. We will then create two tables - ABALONE_TRAIN and ABALONE_TEST - and load the datasets into these tables." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "547907a7-b437-4ff4-8ab8-08b55e0dcc88", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Imported 3340 rows into TRAIN.\n", | ||
"Imported 836 rows into TEST.\n", | ||
"Importing the data took: 470.00ms\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from sklearn.model_selection import train_test_split\n", | ||
"import pyexasol\n", | ||
"\n", | ||
"# Split the data into train and test sets\n", | ||
"df_train, df_test = train_test_split(df, test_size=0.2)\n", | ||
"\n", | ||
"column_desc = [' '.join(c) for c in column_def]\n", | ||
"\n", | ||
"stopwatch = Stopwatch()\n", | ||
"\n", | ||
"# Create Exasol connection\n", | ||
"with pyexasol.connect(dsn=EXASOL_EXTERNAL_HOST, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True) as conn:\n", | ||
"\n", | ||
" # Create tables\n", | ||
" conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TRAIN(' + ', '.join(column_desc) + ')', query_params=params)\n", | ||
" conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN', query_params=params)\n", | ||
"\n", | ||
" # Import data into Exasol\n", | ||
" conn.import_from_pandas(df_train, (EXASOL_SCHEMA, \"ABALONE_TRAIN\"))\n", | ||
" print(f\"Imported {conn.last_statement().rowcount()} rows into TRAIN.\")\n", | ||
" conn.import_from_pandas(df_test, (EXASOL_SCHEMA, \"ABALONE_TEST\"))\n", | ||
" print(f\"Imported {conn.last_statement().rowcount()} rows into TEST.\")\n", | ||
"\n", | ||
"print(f\"Importing the data took: {stopwatch}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bc0456f4-7e8f-4942-8653-c81f71508291", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.