Scikit-learn notebooks

exasol · Oct 4, 2023 · 091aa28 · 091aa28
1 parent 399467f
commit 091aa28
Show file tree

Hide file tree

Showing 8 changed files with 1,827 additions and 0 deletions.
diff --git a/doc/tutorials/data/data_abalone.ipynb b/doc/tutorials/data/data_abalone.ipynb
@@ -0,0 +1,214 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "30001487-61d5-46be-a662-83f406d8cc2a",
+   "metadata": {},
+   "source": [
+    "Here we will load data of physical measurements of abalones (sea snails). <a href=\"https://archive.ics.uci.edu/dataset/1/abalone\" target=\"_blank\" rel=\"noopener\">Follow this link</a> to get details about this dataset.\n",
+    "\n",
+    "To execute queries and upload data to the Exasol database we will be using the <a href=\"https://github.com/exasol/pyexasol\" target=\"_blank\" rel=\"noopener\">`pyexasol`</a> module.\n",
+    "\n",
+    "Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d1fd009-42ca-4b16-929a-d00d284e2e1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n",
+    "# NOTE: host address and credentials are hard-coded here for the tutorial only.\n",
+    "\n",
+    "# Database connection\n",
+    "EXASOL_EXTERNAL_HOST_NAME = \"192.168.124.93\"\n",
+    "EXASOL_HOST_PORT = \"8888\"\n",
+    "EXASOL_EXTERNAL_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_HOST_PORT}\"\"\"\n",
+    "EXASOL_USER = \"sys\"\n",
+    "EXASOL_PASSWORD = \"exasol\"\n",
+    "\n",
+    "# BucketFS access\n",
+    "EXASOL_BUCKETFS_PORT = \"6666\"\n",
+    "EXASOL_EXTERNAL_BUCKETFS_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_BUCKETFS_PORT}\"\"\"\n",
+    "EXASOL_BUCKETFS_USER = \"w\"\n",
+    "EXASOL_BUCKETFS_PASSWORD = \"write\"\n",
+    "EXASOL_BUCKETFS_USE_HTTPS = False\n",
+    "EXASOL_BUCKETFS_URL_PREFIX = \"https://\" if EXASOL_BUCKETFS_USE_HTTPS else \"http://\"\n",
+    "EXASOL_BUCKETFS_SERVICE = \"bfsdefault\"\n",
+    "EXASOL_BUCKETFS_BUCKET = \"default\"\n",
+    "EXASOL_BUCKETFS_PATH = f\"/buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}\" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n",
+    "\n",
+    "# Script language (UDF container) definition\n",
+    "EXASOL_SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n",
+    "EXASOL_UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n",
+    "EXASOL_UDF_RELEASE = \"20190116\"\n",
+    "EXASOL_UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n",
+    "EXASOL_SCRIPT_LANGUAGES = f\"{EXASOL_SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}?lang=python#buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}/exaudf/{EXASOL_UDF_CLIENT}\"\n",
+    "\n",
+    "# Schema in which the tutorial tables are created\n",
+    "EXASOL_SCHEMA = \"IDA\"\n",
+    "\n",
+    "# Keyword arguments for pyexasol.connect(); the key must be \"dsn\" to match that function's parameter name\n",
+    "connection_params = {\"dsn\": EXASOL_EXTERNAL_HOST, \"user\": EXASOL_USER, \"password\": EXASOL_PASSWORD, \"compression\": True}\n",
+    "\n",
+    "# Values made available to SQL statements through the query_params argument of conn.execute()\n",
+    "params = {\n",
+    "    \"script_languages\": EXASOL_SCRIPT_LANGUAGES,\n",
+    "    \"script_language_name\": EXASOL_SCRIPT_LANGUAGE_NAME,\n",
+    "    \"schema\": EXASOL_SCHEMA,\n",
+    "    \"EXASOL_BUCKETFS_PORT\": EXASOL_BUCKETFS_PORT,\n",
+    "    \"EXASOL_BUCKETFS_USER\": EXASOL_BUCKETFS_USER,\n",
+    "    \"EXASOL_BUCKETFS_PASSWORD\": EXASOL_BUCKETFS_PASSWORD,\n",
+    "    \"EXASOL_BUCKETFS_USE_HTTPS\": EXASOL_BUCKETFS_USE_HTTPS,\n",
+    "    \"EXASOL_BUCKETFS_BUCKET\": EXASOL_BUCKETFS_BUCKET,\n",
+    "    \"EXASOL_BUCKETFS_PATH\": EXASOL_BUCKETFS_PATH\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b97740f-c9b6-40f8-a9c7-3ddcd08e0898",
+   "metadata": {},
+   "source": [
+    "First we will load the data into a pandas DataFrame.\n",
+    "We will name the columns as per their description (see the Variable Table in the dataset description)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "863b0429-1fe3-46ae-9569-a08c11c78464",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading the data took: 922.25ms\n",
+      "  Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  Viscera_weight  \\\n",
+      "0   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   \n",
+      "1   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   \n",
+      "2   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   \n",
+      "3   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   \n",
+      "4   I   0.425     0.300   0.095        0.3515          0.1410          0.0775   \n",
+      "\n",
+      "   Shell_weight  Rings  \n",
+      "0         0.070      7  \n",
+      "1         0.210      9  \n",
+      "2         0.155     10  \n",
+      "3         0.055      7  \n",
+      "4         0.120      8  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from urllib.request import urlopen\n",
+    "import tempfile\n",
+    "from zipfile import ZipFile\n",
+    "import pandas as pd\n",
+    "from stopwatch import Stopwatch\n",
+    "\n",
+    "DATA_URL = \"https://archive.ics.uci.edu/static/public/1/abalone.zip\"\n",
+    "DATA_FILE = \"abalone.data\"\n",
+    "\n",
+    "# Column names paired with the SQL types used later to create the tables,\n",
+    "# listed in the order the fields appear in the data file.\n",
+    "column_def = [\n",
+    "    ('Sex', 'CHAR(1)'),                  # M, F, and I (infant)\n",
+    "    ('Length', 'DECIMAL(4,3)'),          # longest shell measurement (mm)\n",
+    "    ('Diameter', 'DECIMAL(4,3)'),        # perpendicular to length (mm)\n",
+    "    ('Height', 'DECIMAL(4,3)'),          # with meat in shell (mm)\n",
+    "    ('Whole_weight', 'DECIMAL(5,4)'),    # whole abalone (grams)\n",
+    "    ('Shucked_weight', 'DECIMAL(5,4)'),  # weight of meat (grams)\n",
+    "    ('Viscera_weight', 'DECIMAL(5,4)'),  # gut weight (after bleeding) (grams)\n",
+    "    ('Shell_weight', 'DECIMAL(4,3)'),    # after being dried (grams)\n",
+    "    ('Rings', 'INT')                     # +1.5 gives the age in years\n",
+    "]\n",
+    "\n",
+    "stopwatch = Stopwatch()\n",
+    "\n",
+    "# Download the zip archive into a temporary file, then read the data file out of it.\n",
+    "with tempfile.TemporaryFile() as zip_file:\n",
+    "    # The HTTP response is closed as soon as its content has been copied.\n",
+    "    with urlopen(DATA_URL) as resp:\n",
+    "        zip_file.write(resp.read())\n",
+    "    print(f\"Downloading the data took: {stopwatch}\")\n",
+    "\n",
+    "    zip_file.seek(0)\n",
+    "    with ZipFile(zip_file) as z:\n",
+    "        with z.open(DATA_FILE, \"r\") as data_file:\n",
+    "            # The data file has no header row. Without header=None pandas would\n",
+    "            # consume the first record as column names and silently drop it.\n",
+    "            df = pd.read_csv(data_file, header=None, names=[c[0] for c in column_def])\n",
+    "\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "916ba87d-57f4-44f6-a0f8-8c5556afb719",
+   "metadata": {},
+   "source": [
+    "Let's split the data randomly into train and test sets. We will then create two tables - ABALONE_TRAIN and ABALONE_TEST - and load the datasets into these tables."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "547907a7-b437-4ff4-8ab8-08b55e0dcc88",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Imported 3340 rows into TRAIN.\n",
+      "Imported 836 rows into TEST.\n",
+      "Importing the data took: 470.00ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "import pyexasol\n",
+    "\n",
+    "# Fixed seed so that re-running the notebook yields the same train/test split\n",
+    "RANDOM_SEED = 42\n",
+    "\n",
+    "# Split the data into train and test sets\n",
+    "df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)\n",
+    "\n",
+    "# Column definitions in SQL form, e.g. \"Sex CHAR(1)\"\n",
+    "column_desc = [' '.join(c) for c in column_def]\n",
+    "\n",
+    "stopwatch = Stopwatch()\n",
+    "\n",
+    "# Create Exasol connection\n",
+    "with pyexasol.connect(dsn=EXASOL_EXTERNAL_HOST, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True) as conn:\n",
+    "\n",
+    "    # Create tables; the test table copies the structure of the train table\n",
+    "    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TRAIN(' + ', '.join(column_desc) + ')', query_params=params)\n",
+    "    conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN', query_params=params)\n",
+    "\n",
+    "    # Import data into Exasol, reporting the row count of each import\n",
+    "    for df_part, part_name in ((df_train, \"TRAIN\"), (df_test, \"TEST\")):\n",
+    "        conn.import_from_pandas(df_part, (EXASOL_SCHEMA, f\"ABALONE_{part_name}\"))\n",
+    "        print(f\"Imported {conn.last_statement().rowcount()} rows into {part_name}.\")\n",
+    "\n",
+    "print(f\"Importing the data took: {stopwatch}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc0456f4-7e8f-4942-8653-c81f71508291",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}