Skip to content

Commit

Permalink
Scikit-learn notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
ahsimb committed Oct 4, 2023
1 parent 399467f commit 091aa28
Show file tree
Hide file tree
Showing 8 changed files with 1,827 additions and 0 deletions.
214 changes: 214 additions & 0 deletions doc/tutorials/data/data_abalone.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "30001487-61d5-46be-a662-83f406d8cc2a",
"metadata": {},
"source": [
"In this notebook we will load a dataset of physical measurements of abalones (sea snails). <a href=\"https://archive.ics.uci.edu/dataset/1/abalone\" target=\"_blank\" rel=\"noopener\">Follow this link</a> to get details about this dataset.\n",
"\n",
"To execute queries and upload data to the Exasol database we will be using the <a href=\"https://github.com/exasol/pyexasol\" target=\"_blank\" rel=\"noopener\">`pyexasol`</a> module.\n",
"\n",
"Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d1fd009-42ca-4b16-929a-d00d284e2e1f",
"metadata": {},
"outputs": [],
"source": [
"# Connection and deployment settings used by the cells below.\n",
"# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n",
"# NOTE: the credentials are hardcoded for this tutorial only; do not reuse them for a real deployment.\n",
"\n",
"# --- Database endpoint and login ---\n",
"EXASOL_EXTERNAL_HOST_NAME = \"192.168.124.93\"\n",
"EXASOL_HOST_PORT = \"8888\"\n",
"EXASOL_EXTERNAL_HOST = f\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_HOST_PORT}\"\n",
"EXASOL_USER = \"sys\"\n",
"EXASOL_PASSWORD = \"exasol\"\n",
"EXASOL_SCHEMA = \"IDA\"\n",
"\n",
"# --- BucketFS endpoint and login ---\n",
"EXASOL_BUCKETFS_PORT = \"6666\"\n",
"EXASOL_EXTERNAL_BUCKETFS_HOST = f\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_BUCKETFS_PORT}\"\n",
"EXASOL_BUCKETFS_USER = \"w\"\n",
"EXASOL_BUCKETFS_PASSWORD = \"write\"\n",
"EXASOL_BUCKETFS_USE_HTTPS = False\n",
"EXASOL_BUCKETFS_URL_PREFIX = \"https://\" if EXASOL_BUCKETFS_USE_HTTPS else \"http://\"\n",
"EXASOL_BUCKETFS_SERVICE = \"bfsdefault\"\n",
"EXASOL_BUCKETFS_BUCKET = \"default\"\n",
"EXASOL_BUCKETFS_PATH = f\"/buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}\" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n",
"\n",
"# --- UDF script language container ---\n",
"EXASOL_SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n",
"EXASOL_UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n",
"EXASOL_UDF_RELEASE = \"20190116\"\n",
"EXASOL_UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n",
"EXASOL_SCRIPT_LANGUAGES = f\"{EXASOL_SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}?lang=python#buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}/exaudf/{EXASOL_UDF_CLIENT}\"\n",
"\n",
"# Keyword arguments for pyexasol.connect(**connection_params).\n",
"# The key must be \"dsn\" (data source name), which is the keyword pyexasol.connect accepts.\n",
"connection_params = {\"dsn\": EXASOL_EXTERNAL_HOST, \"user\": EXASOL_USER, \"password\": EXASOL_PASSWORD, \"compression\": True}\n",
"\n",
"# Values substituted into SQL text through pyexasol's query_params formatting.\n",
"params = {\n",
"    \"script_languages\": EXASOL_SCRIPT_LANGUAGES,\n",
"    \"script_language_name\": EXASOL_SCRIPT_LANGUAGE_NAME,\n",
"    \"schema\": EXASOL_SCHEMA,\n",
"    \"EXASOL_BUCKETFS_PORT\": EXASOL_BUCKETFS_PORT,\n",
"    \"EXASOL_BUCKETFS_USER\": EXASOL_BUCKETFS_USER,\n",
"    \"EXASOL_BUCKETFS_PASSWORD\": EXASOL_BUCKETFS_PASSWORD,\n",
"    \"EXASOL_BUCKETFS_USE_HTTPS\": EXASOL_BUCKETFS_USE_HTTPS,\n",
"    \"EXASOL_BUCKETFS_BUCKET\": EXASOL_BUCKETFS_BUCKET,\n",
"    \"EXASOL_BUCKETFS_PATH\": EXASOL_BUCKETFS_PATH\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "2b97740f-c9b6-40f8-a9c7-3ddcd08e0898",
"metadata": {},
"source": [
"First we will load the data into a pandas DataFrame.\n",
"We will name the columns according to their descriptions (see the Variable Table in the dataset description)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "863b0429-1fe3-46ae-9569-a08c11c78464",
"metadata": {},
"outputs": [],
"source": [
"from urllib.request import urlopen\n",
"import tempfile\n",
"from zipfile import ZipFile\n",
"import pandas as pd\n",
"from stopwatch import Stopwatch\n",
"\n",
"DATA_URL = \"https://archive.ics.uci.edu/static/public/1/abalone.zip\"\n",
"DATA_FILE = \"abalone.data\"\n",
"\n",
"# Column names paired with the SQL types used later to create the tables,\n",
"# listed in the order the columns appear in the data file.\n",
"column_def = [\n",
"    ('Sex', 'CHAR(1)'),                  # M, F, and I (infant)\n",
"    ('Length', 'DECIMAL(4,3)'),          # longest shell measurement (mm)\n",
"    ('Diameter', 'DECIMAL(4,3)'),        # perpendicular to length (mm)\n",
"    ('Height', 'DECIMAL(4,3)'),          # with meat in shell (mm)\n",
"    ('Whole_weight', 'DECIMAL(5,4)'),    # whole abalone (grams)\n",
"    ('Shucked_weight', 'DECIMAL(5,4)'),  # weight of meat (grams)\n",
"    ('Viscera_weight', 'DECIMAL(5,4)'),  # gut weight (after bleeding) (grams)\n",
"    ('Shell_weight', 'DECIMAL(4,3)'),    # after being dried (grams)\n",
"    ('Rings', 'INT')                     # +1.5 gives the age in years\n",
"]\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Download the archive into a temporary file; ZipFile needs a seekable file object.\n",
"with tempfile.TemporaryFile() as archive_file:\n",
"    with urlopen(DATA_URL) as resp:\n",
"        archive_file.write(resp.read())\n",
"    print(f\"Downloading the data took: {stopwatch}\")\n",
"\n",
"    archive_file.seek(0)\n",
"    with ZipFile(archive_file) as archive:\n",
"        with archive.open(DATA_FILE, \"r\") as data_file:\n",
"            # The data file has no header row. Without header=None pandas would\n",
"            # take the first record as column names and silently drop it.\n",
"            df = pd.read_csv(data_file, header=None, names=[name for name, _ in column_def])\n",
"\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"id": "916ba87d-57f4-44f6-a0f8-8c5556afb719",
"metadata": {},
"source": [
"Let's split the data randomly into train and test sets. We will then create two tables - ABALONE_TRAIN and ABALONE_TEST - and load the two sets into them."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "547907a7-b437-4ff4-8ab8-08b55e0dcc88",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Imported 3340 rows into TRAIN.\n",
"Imported 836 rows into TEST.\n",
"Importing the data took: 470.00ms\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"\n",
"# Fixed seed so that every run produces the same train/test partition;\n",
"# without it the contents of the two tables change on each execution.\n",
"RANDOM_SEED = 42\n",
"\n",
"# Hold out 20% of the rows as the test set\n",
"df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)\n",
"\n",
"# One \"<name> <sql type>\" entry per column, for the CREATE TABLE statement\n",
"column_desc = [' '.join(c) for c in column_def]\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Create Exasol connection\n",
"with pyexasol.connect(dsn=EXASOL_EXTERNAL_HOST, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True) as conn:\n",
"\n",
"    # Create the tables; the test table copies the structure of the train table\n",
"    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TRAIN(' + ', '.join(column_desc) + ')', query_params=params)\n",
"    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN', query_params=params)\n",
"\n",
"    # Import each set into its table and report the number of imported rows\n",
"    for label, frame in ((\"TRAIN\", df_train), (\"TEST\", df_test)):\n",
"        conn.import_from_pandas(frame, (EXASOL_SCHEMA, f\"ABALONE_{label}\"))\n",
"        print(f\"Imported {conn.last_statement().rowcount()} rows into {label}.\")\n",
"\n",
"print(f\"Importing the data took: {stopwatch}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc0456f4-7e8f-4942-8653-c81f71508291",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 091aa28

Please sign in to comment.