Skip to content

Commit

Permalink
Addressed review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ahsimb committed Oct 4, 2023
1 parent 091aa28 commit 40c304f
Show file tree
Hide file tree
Showing 8 changed files with 647 additions and 395 deletions.
132 changes: 83 additions & 49 deletions doc/tutorials/data/data_abalone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,46 +14,76 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "7d1fd009-42ca-4b16-929a-d00d284e2e1f",
"metadata": {},
"outputs": [],
"source": [
"# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n",
"EXASOL_EXTERNAL_HOST_NAME = \"192.168.124.93\"\n",
"EXASOL_HOST_PORT = \"8888\"\n",
"EXASOL_EXTERNAL_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_HOST_PORT}\"\"\"\n",
"EXASOL_USER = \"sys\"\n",
"EXASOL_PASSWORD = \"exasol\"\n",
"EXASOL_BUCKETFS_PORT = \"6666\"\n",
"EXASOL_EXTERNAL_BUCKETFS_HOST = f\"\"\"{EXASOL_EXTERNAL_HOST_NAME}:{EXASOL_BUCKETFS_PORT}\"\"\"\n",
"EXASOL_BUCKETFS_USER = \"w\"\n",
"EXASOL_BUCKETFS_PASSWORD = \"write\"\n",
"EXASOL_BUCKETFS_USE_HTTPS = False\n",
"EXASOL_BUCKETFS_URL_PREFIX = \"https://\" if EXASOL_BUCKETFS_USE_HTTPS else \"http://\"\n",
"EXASOL_BUCKETFS_SERVICE = \"bfsdefault\"\n",
"EXASOL_BUCKETFS_BUCKET = \"default\"\n",
"EXASOL_BUCKETFS_PATH = f\"/buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}\" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n",
"EXASOL_SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n",
"EXASOL_UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n",
"EXASOL_UDF_RELEASE= \"20190116\"\n",
"EXASOL_UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n",
"EXASOL_SCRIPT_LANGUAGES = f\"{EXASOL_SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}?lang=python#buckets/{EXASOL_BUCKETFS_SERVICE}/{EXASOL_BUCKETFS_BUCKET}/{EXASOL_UDF_FLAVOR}/exaudf/{EXASOL_UDF_CLIENT}\";\n",
"EXASOL_SCHEMA = \"IDA\"\n",
"\n",
"connection_params = {\"dns\": EXASOL_EXTERNAL_HOST, \"user\": EXASOL_USER, \"password\": EXASOL_PASSWORD, \"compression\": True}\n",
"\n",
"params = {\n",
" \"script_languages\": EXASOL_SCRIPT_LANGUAGES,\n",
" \"script_language_name\": EXASOL_SCRIPT_LANGUAGE_NAME,\n",
" \"schema\": EXASOL_SCHEMA,\n",
" \"EXASOL_BUCKETFS_PORT\": EXASOL_BUCKETFS_PORT,\n",
" \"EXASOL_BUCKETFS_USER\": EXASOL_BUCKETFS_USER,\n",
" \"EXASOL_BUCKETFS_PASSWORD\": EXASOL_BUCKETFS_PASSWORD,\n",
" \"EXASOL_BUCKETFS_USE_HTTPS\": EXASOL_BUCKETFS_USE_HTTPS,\n",
" \"EXASOL_BUCKETFS_BUCKET\": EXASOL_BUCKETFS_BUCKET,\n",
" \"EXASOL_BUCKETFS_PATH\": EXASOL_BUCKETFS_PATH\n",
"}"
"from dataclasses import dataclass, field\n",
"\n",
"\n",
"@dataclass\n",
"class SandboxConfig:\n",
"    \"\"\"Database, BucketFS and UDF settings used by this tutorial.\n",
"\n",
"    The settings are annotated dataclass fields, so any of them can be\n",
"    overridden per instance, e.g. SandboxConfig(EXTERNAL_HOST_NAME=...).\n",
"    Values composed from other settings are exposed as read-only properties.\n",
"    \"\"\"\n",
"\n",
"    EXTERNAL_HOST_NAME: str = \"192.168.124.93\"\n",
"    HOST_PORT: str = \"8888\"\n",
"    USER: str = \"sys\"\n",
"    # repr=False keeps passwords out of the generated repr, and so out of cell output.\n",
"    PASSWORD: str = field(default=\"exasol\", repr=False)\n",
"    BUCKETFS_PORT: str = \"6666\"\n",
"    BUCKETFS_USER: str = \"w\"\n",
"    BUCKETFS_PASSWORD: str = field(default=\"write\", repr=False)\n",
"    BUCKETFS_USE_HTTPS: bool = False\n",
"    BUCKETFS_SERVICE: str = \"bfsdefault\"\n",
"    BUCKETFS_BUCKET: str = \"default\"\n",
"    SCRIPT_LANGUAGE_NAME: str = \"PYTHON3_60\"\n",
"    UDF_FLAVOR: str = \"python3-ds-EXASOL-6.0.0\"\n",
"    UDF_RELEASE: str = \"20190116\"\n",
"    UDF_CLIENT: str = \"exaudfclient\"  # or for newer versions of the flavor exaudfclient_py3\n",
"    SCHEMA: str = \"IDA\"\n",
"\n",
"    @property\n",
"    def EXTERNAL_HOST(self) -> str:\n",
"        \"\"\"Database address in host:port form.\"\"\"\n",
"        return f\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\n",
"\n",
"    @property\n",
"    def EXTERNAL_BUCKETFS_HOST(self) -> str:\n",
"        \"\"\"BucketFS address in host:port form.\"\"\"\n",
"        return f\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\n",
"\n",
"    @property\n",
"    def BUCKETFS_URL_PREFIX(self) -> str:\n",
"        \"\"\"URL scheme prefix selected by BUCKETFS_USE_HTTPS.\"\"\"\n",
"        return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n",
"\n",
"    @property\n",
"    def BUCKETFS_PATH(self) -> str:\n",
"        \"\"\"Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container.\"\"\"\n",
"        return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n",
"\n",
"    @property\n",
"    def SCRIPT_LANGUAGES(self) -> str:\n",
"        \"\"\"Script-language definition built from the language name, BucketFS location and UDF client.\"\"\"\n",
"        flavor_path = f\"{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}\"\n",
"        # Adjacent single-line f-strings are joined without separators; a triple-quoted\n",
"        # literal spanning several lines would embed newlines and indentation in the value.\n",
"        return (\n",
"            f\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{flavor_path}\"\n",
"            f\"?lang=python#buckets/{flavor_path}/exaudf/{self.UDF_CLIENT}\"\n",
"        )\n",
"\n",
"    @property\n",
"    def connection_params(self) -> dict:\n",
"        \"\"\"Database connection settings as a dictionary.\"\"\"\n",
"        # NOTE(review): the host key is spelled \"dns\", whereas pyexasol.connect() takes `dsn`;\n",
"        # confirm with the consumers of this dict before renaming the key.\n",
"        return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n",
"\n",
"    @property\n",
"    def params(self) -> dict:\n",
"        \"\"\"Script-language, schema and BucketFS settings collected into one dictionary.\"\"\"\n",
"        return {\n",
"            \"script_languages\": self.SCRIPT_LANGUAGES,\n",
"            \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n",
"            \"schema\": self.SCHEMA,\n",
"            \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n",
"            \"BUCKETFS_USER\": self.BUCKETFS_USER,\n",
"            \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n",
"            \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n",
"            \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n",
"            \"BUCKETFS_PATH\": self.BUCKETFS_PATH,\n",
"        }\n",
"\n",
"\n",
"conf = SandboxConfig()"
]
},
{
Expand All @@ -67,15 +97,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 10,
"id": "863b0429-1fe3-46ae-9569-a08c11c78464",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading the data took: 922.25ms\n",
"Downloading the data took: 928.85ms\n",
" Sex Length Diameter Height Whole_weight Shucked_weight Viscera_weight \\\n",
"0 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 \n",
"1 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 \n",
Expand All @@ -96,6 +126,7 @@
"from urllib.request import urlopen\n",
"import tempfile\n",
"from zipfile import ZipFile\n",
"from contextlib import ExitStack\n",
"import pandas as pd\n",
"from stopwatch import Stopwatch\n",
"\n",
Expand All @@ -105,14 +136,15 @@
"DATA_FILE = \"abalone.data\"\n",
"\n",
"resp = urlopen(DATA_URL)\n",
"with tempfile.TemporaryFile() as f:\n",
"with ExitStack() as stack:\n",
" f = stack.enter_context(tempfile.TemporaryFile())\n",
" f.write(resp.read())\n",
" print(f\"Downloading the data took: {stopwatch}\")\n",
"\n",
" f.seek(0)\n",
" with ZipFile(f) as z:\n",
" with z.open(DATA_FILE, \"r\") as f:\n",
" df = pd.read_csv(f)\n",
" z = stack.enter_context(ZipFile(f))\n",
" f = stack.enter_context(z.open(DATA_FILE, \"r\"))\n",
" df = pd.read_csv(f)\n",
"\n",
"column_def = [\n",
" ('Sex', 'CHAR(1)'),\t # M, F, and I (infant)\n",
Expand All @@ -125,7 +157,7 @@
" ('Shell_weight', 'DECIMAL(4,3)'), # after being dried (grams)\n",
" ('Rings', 'INT') # +1.5 gives the age in years\n",
"]\n",
"df.columns = [c[0] for c in column_def]\n",
"df.columns = [name for name, _ in column_def]\n",
"\n",
"print(df.head())"
]
Expand All @@ -140,7 +172,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"id": "547907a7-b437-4ff4-8ab8-08b55e0dcc88",
"metadata": {},
"outputs": [
Expand All @@ -150,7 +182,7 @@
"text": [
"Imported 3340 rows into TRAIN.\n",
"Imported 836 rows into TEST.\n",
"Importing the data took: 470.00ms\n"
"Importing the data took: 338.11ms\n"
]
}
],
Expand All @@ -166,16 +198,18 @@
"stopwatch = Stopwatch()\n",
"\n",
"# Create Exasol connection\n",
"with pyexasol.connect(dsn=EXASOL_EXTERNAL_HOST, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True) as conn:\n",
"with pyexasol.connect(dsn=conf.EXTERNAL_HOST, user=conf.USER, password=conf.PASSWORD, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TRAIN(' + ', '.join(column_desc) + ')', query_params=params)\n",
" conn.execute(query='CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN', query_params=params)\n",
" # Query parameters (e.g. the schema identifier) are taken from the sandbox configuration object.\n",
" sql = f'CREATE OR REPLACE TABLE {{schema!i}}.ABALONE_TRAIN({\", \".join(column_desc)})'\n",
" conn.execute(query=sql, query_params=conf.params)\n",
" sql = 'CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN'\n",
" conn.execute(query=sql, query_params=conf.params)\n",
"\n",
" # Import data into Exasol\n",
" conn.import_from_pandas(df_train, (EXASOL_SCHEMA, \"ABALONE_TRAIN\"))\n",
" conn.import_from_pandas(df_train, (conf.SCHEMA, \"ABALONE_TRAIN\"))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into TRAIN.\")\n",
" conn.import_from_pandas(df_test, (EXASOL_SCHEMA, \"ABALONE_TEST\"))\n",
" conn.import_from_pandas(df_test, (conf.SCHEMA, \"ABALONE_TEST\"))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into TEST.\")\n",
"\n",
"print(f\"Importing the data took: {stopwatch}\")"
Expand Down
Loading

0 comments on commit 40c304f

Please sign in to comment.