From f7b762f7f9f01e8fc61c6df96cede0b36d1da8cd Mon Sep 17 00:00:00 2001
From: mibe
Date: Mon, 23 Oct 2023 09:30:04 +0100
Subject: [PATCH] Initial set of transformer notebooks

---
 .../transformer/masked_modelling.ipynb        | 277 +++++++++++++++
 .../transformer/question_answering.ipynb      | 329 ++++++++++++++++++
 .../transformer/sequence_classification.ipynb | 269 ++++++++++++++
 doc/tutorials/transformer/te_init.ipynb       | 241 +++++++++++++
 .../transformer/text_generation.ipynb         | 270 ++++++++++++++
 .../transformer/token_classification.ipynb    | 259 ++++++++++++++
 doc/tutorials/transformer/translation.ipynb   | 268 ++++++++++++++
 .../zero_shot_classification.ipynb            | 255 ++++++++++++++
 8 files changed, 2168 insertions(+)
 create mode 100644 doc/tutorials/transformer/masked_modelling.ipynb
 create mode 100644 doc/tutorials/transformer/question_answering.ipynb
 create mode 100644 doc/tutorials/transformer/sequence_classification.ipynb
 create mode 100644 doc/tutorials/transformer/te_init.ipynb
 create mode 100644 doc/tutorials/transformer/text_generation.ipynb
 create mode 100644 doc/tutorials/transformer/token_classification.ipynb
 create mode 100644 doc/tutorials/transformer/translation.ipynb
 create mode 100644 doc/tutorials/transformer/zero_shot_classification.ipynb

diff --git a/doc/tutorials/transformer/masked_modelling.ipynb b/doc/tutorials/transformer/masked_modelling.ipynb
new file mode 100644
index 00000000..0ba8994b
--- /dev/null
+++ b/doc/tutorials/transformer/masked_modelling.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b25130b6-6e01-481a-91a0-838198be3ea8",
+   "metadata": {},
+   "source": [
+    "In this notebook we will load and use a masked language model. This kind of model predicts which words would replace masked words in a sentence. Learn more about the Fill-Mask task here.\n",
+    "\n",
+    "We will be running SQL queries using JupySQL SQL Magic.\n",
+    "\n",
+    "Prior to using this notebook one needs to complete the following steps:\n",
+    "1. 
[Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4884f64d-aee2-4248-a922-8d28cf70209f", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " 
{self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " \"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + " @property\n", + " def WEBSOCKET_URL(self):\n", + " return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "ff882542-d473-4767-a035-5c2615080cae", + "metadata": {}, + "source": [ + "First let's bring up the JupySQL and connect to the database via the SQLAlchemy. Please refer to the documentation in the sqlalchemy-exasol for details on how to connect to the database using Exasol SQLAlchemy driver." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f0cc45d2-d5ae-4afc-9f1d-251923995990", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "e0ad672a-8b26-467b-8649-0bf95b1efb61", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "In this notebook we will use the second method.\n", + "\n", + "To demonstrate finding of a masked word task we will use a [RadBERT model](https://huggingface.co/StanfordAIMI/RadBERT) which was pre-trained on radiology reports. This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dede4beb-9bfe-413c-846a-a2e5c6eaa784", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'StanfordAIMI/RadBERT'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82d4acb0-3440-4e5a-b3fa-fdd45e19cb31", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)\n", + "model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "2322e677-b4ce-46cd-a807-544afb42b4b4", + "metadata": {}, + "source": [ + "Now we can upload the model into the BucketFS using a command line. 
Unfortunately we cannot tell exactly when this process has finished. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for a few moments after that, before querying the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "93b5d4ca-ad27-4421-bad3-3a98851db70d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "upload_command = f\"\"\"python -m exasol_transformers_extension.upload_model \\\n",
+    "    --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n",
+    "    --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n",
+    "    --bucketfs-port {conf.BUCKETFS_PORT} \\\n",
+    "    --bucketfs-user {conf.BUCKETFS_USER} \\\n",
+    "    --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n",
+    "    --bucket {conf.BUCKETFS_BUCKET} \\\n",
+    "    --path-in-bucket {conf.BFS_DIR} \\\n",
+    "    --model-name {MODEL_NAME} \\\n",
+    "    --sub-dir {conf.TE_MODELS_DIR} \\\n",
+    "    --local-model-path {conf.TE_MODELS_CACHE_DIR}\n",
+    "    \"\"\"\n",
+    "!{upload_command}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9825231e-f7fa-4011-9d11-f94890f6ba7d",
+   "metadata": {},
+   "source": [
+    "Let's see if the model can find a masked word in an instruction usually given to a patient when a radiographer is taking a chest X-ray."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7c0ff55c-dada-4f5f-a923-8c86695723bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This is a sentence with a masked word that will be given to the model.\n",
+    "MY_TEXT = 'Take a deep [MASK] and hold it'\n",
+    "\n",
+    "# Make sure our text can be used in an SQL statement.\n",
+    "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")\n",
+    "\n",
+    "# We will collect 5 best answers."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d8288e3-0658-44a3-8fc6-92d6e1c4b3f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_FILLING_MASK_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{MY_TEXT}}',\n", + " 5\n", + " )\n", + ")\n", + "SELECT filled_text, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a164578-5d2a-40e2-8bdd-2800765fe5c7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/question_answering.ipynb b/doc/tutorials/transformer/question_answering.ipynb new file mode 100644 index 00000000..98055ac8 --- /dev/null +++ b/doc/tutorials/transformer/question_answering.ipynb @@ -0,0 +1,329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0faec19c-8e4f-4ae8-8772-38dcca47d9e4", + "metadata": {}, + "source": [ + "In this notebook we will load and use a question answering language model that can retrieve the answer to a question from a given text. Learn more about the Question Answering task here.\n", + "\n", + "We will be running SQL queries using JupySQL SQL Magic.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. [Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a28bb232-38d5-445a-9e7a-6f72e80bc2cc", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def 
connection_params(self):\n",
+    "        return {\"dsn\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n",
+    "\n",
+    "    @property\n",
+    "    def params(self):\n",
+    "        return {\n",
+    "            \"script_languages\": self.SCRIPT_LANGUAGES,\n",
+    "            \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n",
+    "            \"schema\": self.SCHEMA,\n",
+    "            \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n",
+    "            \"BUCKETFS_USER\": self.BUCKETFS_USER,\n",
+    "            \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n",
+    "            \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n",
+    "            \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n",
+    "            \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n",
+    "        }\n",
+    "\n",
+    "    # Name of the BucketFS connection\n",
+    "    BFS_CONN = 'MyBFSConn'\n",
+    "\n",
+    "    # Name of a sub-directory of the bucket root\n",
+    "    BFS_DIR = 'my_storage'\n",
+    "\n",
+    "    # We will store all models in this sub-directory at BucketFS\n",
+    "    TE_MODELS_DIR = 'models'\n",
+    "    \n",
+    "    # We will save cached model in this sub-directory relative to the current directory on the local machine.\n",
+    "    TE_MODELS_CACHE_DIR = 'models_cache'\n",
+    "\n",
+    "    @property\n",
+    "    def WEBSOCKET_URL(self):\n",
+    "        return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n",
+    "\n",
+    "conf = SandboxConfig()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd5e24a7-ecde-42cb-bdff-7df4fc4f5c84",
+   "metadata": {},
+   "source": [
+    "First let's bring up the JupySQL and connect to the database via SQLAlchemy. Please refer to the documentation in the sqlalchemy-exasol for details on how to connect to the database using Exasol SQLAlchemy driver."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13fa8443-17f5-4f75-8c2a-3a86d13d7911", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "b60ef68c-3556-4742-91b9-836b31699e4c", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "In this notebook we will use the second method.\n", + "\n", + "To demonstrate the question answering task we will use a [roberta model](https://huggingface.co/deepset/roberta-base-squad2).\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6244f4ff-5054-486c-86ef-4f2389744ed9", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'deepset/roberta-base-squad2'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec3e4231-1122-4622-a43a-0545135b0a1e", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForQuestionAnswering\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)\n", + "model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "3122d4bf-4942-4ee1-a73f-3f61ac46c4af", + "metadata": {}, + "source": [ + "Now we can upload the model into the BucketFS using a command line. 
Unfortunately we cannot tell exactly when this process has finished. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b6642a83-3f69-4e2f-b1ef-59817faf307c", + "metadata": {}, + "outputs": [], + "source": [ + "upload_command = f\"\"\"python -m exasol_transformers_extension.upload_model \\\n", + " --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n", + " --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n", + " --bucketfs-port {conf.BUCKETFS_PORT} \\\n", + " --bucketfs-user {conf.BUCKETFS_USER} \\\n", + " --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n", + " --bucket {conf.BUCKETFS_BUCKET} \\\n", + " --path-in-bucket {conf.BFS_DIR} \\\n", + " --model-name {MODEL_NAME} \\\n", + " --sub-dir {conf.TE_MODELS_DIR} \\\n", + " --local-model-path {conf.TE_MODELS_CACHE_DIR}\n", + " \"\"\"\n", + "!{upload_command}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ccf3eb8d-6fcf-4d49-ba5f-0b8488aa7b0d", + "metadata": {}, + "outputs": [], + "source": [ + "# This will be our question\n", + "TEST_QUESTION = 'What bitumen is used for?'\n", + "\n", + "# Let's first try it with the following context\n", + "TEST_CONTEXT1 = \"\"\"\n", + "Apart from stylish design features of new flat roofs, the other thing that’s moved on considerably is the technology\n", + "used to keep them weather-proof. Once flat roofs were notoriously prone to leaking and the problem could only be solved\n", + "with a boiling cauldron of tar. 
These days there are patch repair kits, liquid rubber membranes and even quick,\n", + "efficient waterproofing paint that last for ages – and can even be applied in damp weather.\n", + "\"\"\"\n", + "\n", + "# Make sure our texts can be used in an SQL statement.\n", + "TEST_QUESTION = TEST_QUESTION.replace(\"'\", \"''\")\n", + "TEST_CONTEXT1 = TEST_CONTEXT1.replace(\"'\", \"''\")\n", + "\n", + "# We will collect 5 best answers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "346b1b8a-b1aa-4bea-b351-83b8ee6ab7b7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_QUESTION_ANSWERING_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{TEST_QUESTION}}',\n", + " '{{TEST_CONTEXT1}}',\n", + " 5\n", + " )\n", + ")\n", + "SELECT answer, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + "cell_type": "markdown", + "id": "061139b0-2f3c-41c5-8176-d83789cb39e5", + "metadata": {}, + "source": [ + "Let's change the context and see a different set of answers." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0787b6ac-f3db-4971-b010-0eac38bd1921", + "metadata": {}, + "outputs": [], + "source": [ + "# New context\n", + "TEST_CONTEXT2 = \"\"\"\n", + "You can make a wooden planter in a day, using treated timber. Simply work out how big an area you need,\n", + "cut the wood to size and follow our steps to putting the planter together. 
Make sure your wooden planter\n", + "has drainage holes, so plants don’t become waterlogged.\n", + "\"\"\"\n", + "\n", + "TEST_CONTEXT2 = TEST_CONTEXT2.replace(\"'\", \"''\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07b71013-b0ae-44c9-a299-ade8e307213c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_QUESTION_ANSWERING_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{TEST_QUESTION}}',\n", + " '{{TEST_CONTEXT2}}',\n", + " 5\n", + " )\n", + ")\n", + "SELECT answer, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb7975eb-e339-4164-a636-24b82a382236", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/sequence_classification.ipynb b/doc/tutorials/transformer/sequence_classification.ipynb new file mode 100644 index 00000000..b21b9a8c --- /dev/null +++ b/doc/tutorials/transformer/sequence_classification.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8fa128b9-3c22-4f33-85c5-be5d9c5b290f", + "metadata": {}, + "source": [ + "In this notebook we will load and use a text classification language model that can assigning a label to a given text. 
Learn more about the Text Classification task here.\n", + "\n", + "We will be running SQL queries using JupySQL SQL Magic.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. [Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b4ef3baf-8292-4db0-b86b-88a110d6feb3", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def 
SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " \"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + " @property\n", + " def WEBSOCKET_URL(self):\n", + " return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "836f39d7-26f1-4419-bfa9-a0057a45380f", + "metadata": {}, + "source": [ + "First let's bring up the JupySQL and connect to the database via the SQLAlchemy. 
\n", + "Please refer to the documentation in the [sqlalchemy-exasol](https://github.com/exasol/sqlalchemy-exasol) for details on how to connect to the database using Exasol SQLAlchemy driver." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fe38507-de1a-417e-81d4-70c956a70914", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "aaee4351-6210-4266-9525-d35f4381ba30", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "The first method requires the database machine to have internet access. Here we assume this condition is met. Otherwise please refer to another notebook where the second method is demonstrated.\n", + "\n", + "To demonstrate the text classification task we will use the [Ekman emotions classifier](https://huggingface.co/arpanghoshal/EkmanClassifier) model.\n", + "\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string.\n", + "\n", + "Please note that loading a model, especially a big one, may take considerable time. At the time of writing we do not have any means to check the completion of this process. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5f1ff6db-1bd5-4886-b353-27f3c72db2bf", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'arpanghoshal/EkmanClassifier'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72a23643-9ed2-4f17-8c05-3fa40db2d029", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT TE_MODEL_DOWNLOADER_UDF(\n", + " '{{MODEL_NAME}}',\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{conf.BFS_CONN}}',\n", + " ''\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5cf5a2a7-ddab-40b8-aed1-7e6ddaec867a", + "metadata": {}, + "source": [ + "Let's try to classify a single phrase which definitely bears emotions but is also somewhat ambiguous - \"Oh my God!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a9e9b4-d9ce-410b-b374-a7d9f8feccea", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_SEQUENCE_CLASSIFICATION_SINGLE_TEXT_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " 'Oh my God!'\n", + " )\n", + ")\n", + "SELECT label, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + "cell_type": "markdown", + "id": "04ce1452-2571-447b-94f9-b923e6a2cb75", + "metadata": {}, + "source": [ + "Now we are going to add some context to our exclamation and use another UDF that takes a pair of sentences. Let's see how it will change the model output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1eb66252-36b6-441f-9e7d-bf9ffcb3f8df", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_SEQUENCE_CLASSIFICATION_TEXT_PAIR_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " 'Oh my God!',\n", + " 'I lost my purse.'\n", + " )\n", + ")\n", + "SELECT label, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d8d1de0-a64d-4884-acfc-d4709bb33e58", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/te_init.ipynb b/doc/tutorials/transformer/te_init.ipynb new file mode 100644 index 00000000..23cb69ea --- /dev/null +++ b/doc/tutorials/transformer/te_init.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0273b6a-8147-49d6-9591-c5434db58bc1", + "metadata": {}, + "source": [ + "Here we will perform all the necessary steps to get the Transformer Extension functionality up and running. Please refer to the Transformer Extension User Guide for details on the required initialization steps. Note the installation of the extension is included in the installation of this product, therefore the first step mentioned in the guide can be skipped.\n", + "\n", + "We will be using a generic prediction UDF script. 
To execute queries and load data from Exasol database we will be using the `pyexasol` module.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. [Create the database schema](../setup_db.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7433e5e3-5258-4773-b202-7aa1b05303ef", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return 
f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " \"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "2f52f81c-5b0d-45b9-be11-def723aa4d25", + "metadata": {}, + "source": [ + "First, let's upload into the BucketFS and activate the required Script-Language-Container (SLC). This can be done by running the command below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d66cd419-d4fd-4aee-8b76-32e358f29d9f", + "metadata": {}, + "outputs": [], + "source": [ + "# THIS COMMAND DOESN'T WORK WITH THE RELEASED VERSION OF THE EXTENSION.\n", + "# THE CONTAINER UPLOADING HAS TO BE DONE EXTERNALLY.\n", + "\n", + "deploy_command = f\"\"\"\n", + "python -m exasol_transformers_extension.deploy language-container \\\n", + " --dsn {conf.EXTERNAL_HOST} \\\n", + " --db-user {conf.USER} \\\n", + " --db-pass {conf.PASSWORD} \\\n", + " --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n", + " --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n", + " --bucketfs-port {conf.BUCKETFS_PORT} \\\n", + " --bucketfs-user {conf.BUCKETFS_USER} \\\n", + " --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n", + " --bucketfs-use-https {conf.BUCKETFS_USE_HTTPS} \\\n", + " --bucket {conf.BUCKETFS_BUCKET} \\\n", + " --path-in-bucket . \\\n", + " --language-alias {conf.SCRIPT_LANGUAGE_NAME} \\\n", + " --version 0.5.0\n", + "\"\"\"\n", + "\n", + "# !{deploy_command}\n", + "print(deploy_command)" + ] + }, + { + "cell_type": "markdown", + "id": "13d94696-6083-4099-853c-1193d6d111a0", + "metadata": {}, + "source": [ + "Now we shall upload all scripts into the database.\n", + "Note, that the SLC must be uploaded first. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "328340bf-aa91-4643-bb80-b16a9c08d4f4", + "metadata": {}, + "outputs": [], + "source": [ + "deploy_command = f\"\"\"\n", + "python -m exasol_transformers_extension.deploy scripts \\\n", + " --dsn {conf.EXTERNAL_HOST} \\\n", + " --db-user {conf.USER} \\\n", + " --db-pass {conf.PASSWORD} \\\n", + " --schema {conf.SCHEMA} \\\n", + " --language-alias {conf.SCRIPT_LANGUAGE_NAME} \\\n", + " --no-use-ssl-cert-validation\n", + "\"\"\"\n", + "print(deploy_command)\n", + "\n", + "!{deploy_command}" + ] + }, + { + "cell_type": "markdown", + "id": "63f239be-7e7d-4f1a-8e49-72040254d00d", + "metadata": {}, + "source": [ + "Let's create a connection to the BucketFS where we are going to store all our models. We will use this connection hereafter in the queries.\n", + "\n", + "Notice that we specify a sub-directory of the bucket root, e.g. \"my_storage\" (the name can be chosen arbitrarily). The BucketFS will create this sub-directory for us the first time we use the connection." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "85ca87cc-1e1e-4289-9898-f7c1e3747de7", + "metadata": {}, + "outputs": [], + "source": [ + "import pyexasol\n", + "\n", + "sql = f\"\"\"\n", + "CREATE OR REPLACE CONNECTION [{conf.BFS_CONN}]\n", + " TO '{conf.BUCKETFS_URL_PREFIX}{conf.EXTERNAL_BUCKETFS_HOST}/{conf.BUCKETFS_BUCKET}/{conf.BFS_DIR};{conf.BUCKETFS_SERVICE}'\n", + " USER {{BUCKETFS_USER!s}}\n", + " IDENTIFIED BY {{BUCKETFS_PASSWORD!s}}\n", + "\"\"\"\n", + "\n", + "with pyexasol.connect(dsn=conf.EXTERNAL_HOST, user=conf.USER, password=conf.PASSWORD, compression=True) as conn:\n", + " conn.execute(query=sql, query_params=conf.params)" + ] + }, + { + "cell_type": "markdown", + "id": "d6fc18b3-4e98-4f59-b22b-5ba87c9997cc", + "metadata": {}, + "source": [ + "Some models require the [Sacremoses tokenizer](https://github.com/alvations/sacremoses) to be installed in the local environment when they get downloaded. 
Let's make sure we have it installed by running the command below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134fcd15-8b6f-4ebd-9bef-1431c371b437", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sacremoses" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/text_generation.ipynb b/doc/tutorials/transformer/text_generation.ipynb new file mode 100644 index 00000000..5e46de2f --- /dev/null +++ b/doc/tutorials/transformer/text_generation.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b73a1de2-05df-49ab-bec9-897f11dbe9a9", + "metadata": {}, + "source": [ + "In this notebook we will load and use a generative language model that can produce a continuation for a given text. Learn more about the Text Generation task here.\n", + "\n", + "We will be using a generic prediction UDF script. To execute queries and load data from Exasol database we will be using the `pyexasol` module.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. [Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "83b93680-738d-4b70-aa51-117b10d63915", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. 
Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " 
\"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "6d11a1d4-051d-4d21-990a-660beaeb8f0c", + "metadata": {}, + "source": [ + "First we need to download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "In this notebook we will use the second method.\n", + "\n", + "To demonstrate the text generation task we will use [Open Pretrained Transformers (OPT)](https://huggingface.co/facebook/opt-125m), a decoder-only pre-trained transformer from Facebook.\n", + "\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d73c9b44-93a0-4df3-9a8e-54e182027d61", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'facebook/opt-125m'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57029a52-064b-4fda-80bf-289cce50ffc4", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "6cf6786e-1883-4ff7-8014-338e9cdedc19", + "metadata": {}, + "source": [ + "Now we can upload the model into the BucketFS using a command line. Unfortunately we cannot tell exactly when this process has finished. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "20b28eb8-0900-42be-96c3-94344abdcbc5", + "metadata": {}, + "outputs": [], + "source": [ + "upload_command = f\"\"\"python -m exasol_transformers_extension.upload_model \\\n", + " --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n", + " --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n", + " --bucketfs-port {conf.BUCKETFS_PORT} \\\n", + " --bucketfs-user {conf.BUCKETFS_USER} \\\n", + " --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n", + " --bucket {conf.BUCKETFS_BUCKET} \\\n", + " --path-in-bucket {conf.BFS_DIR} \\\n", + " --model-name {MODEL_NAME} \\\n", + " --sub-dir {conf.TE_MODELS_DIR} \\\n", + " --local-model-path {conf.TE_MODELS_CACHE_DIR}\n", + " \"\"\"\n", + "!{upload_command}" + ] + }, + { + "cell_type": "markdown", + "id": "b4efa927-aa78-4b80-9b78-25e722904217", + "metadata": {}, + "source": [ + "Let's put the start of our conversation in a variable." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aa3998b7-d886-4b0c-b0d2-92e54cc27b91", + "metadata": {}, + "outputs": [], + "source": [ + "MY_TEXT = 'The bar-headed goose can fly at much'\n", + "\n", + "# Make sure our texts can be used in an SQL statement.\n", + "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "56498899-79f8-471e-8337-983184dcd513", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's put a limit on the length of text the model can generate in one call.\n", + "# The limit is specified in the number of characters.\n", + "MAX_LENGTH = 30" + ] + }, + { + "cell_type": "markdown", + "id": "27b1dd67-ffed-4bf8-9ee7-a1e003cdbcc6", + "metadata": {}, + "source": [ + "We will be updating this variable at every call to the model.\n", + "Please run the next cell multiple times to see how the text evolves." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc21bb8f-29e4-4718-b01a-ec3fafffeee0", + "metadata": {}, + "outputs": [], + "source": [ + "import pyexasol\n", + "\n", + "sql = f\"\"\"\n", + "SELECT {conf.SCHEMA}.TE_TEXT_GENERATION_UDF(\n", + " NULL,\n", + " '{conf.BFS_CONN}',\n", + " NULL,\n", + " '{conf.TE_MODELS_DIR}',\n", + " '{MODEL_NAME}',\n", + " '{MY_TEXT}',\n", + " {MAX_LENGTH},\n", + " True\n", + ")\n", + "\"\"\"\n", + "\n", + "with pyexasol.connect(dsn=conf.EXTERNAL_HOST, user=conf.USER, password=conf.PASSWORD, compression=True) as conn:\n", + " result = conn.export_to_pandas(query_or_table=sql, query_params=conf.params).squeeze()\n", + " MY_TEXT = result['GENERATED_TEXT']\n", + " # The error can be observed at result['ERROR_MESSAGE']\n", + "\n", + "print(MY_TEXT)\n", + "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/token_classification.ipynb b/doc/tutorials/transformer/token_classification.ipynb new file mode 100644 index 00000000..0cdc90ca --- /dev/null +++ b/doc/tutorials/transformer/token_classification.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "150390c4-509d-4c6e-a073-2b24adc6a434", + "metadata": {}, + "source": [ + "In this notebook we will load and use a token classifier language model that assigns labels to some tokens in a text. 
Learn more about the Token Classification task here.
SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " \"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + " @property\n", + " def WEBSOCKET_URL(self):\n", + " return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "226cfcbf-fef1-47ca-9134-4e9dc6f333a9", + "metadata": {}, + "source": [ + "First let's bring up the JupySQL and connect to the database via the SQLAlchemy. 
Please refer to the documentation in the sqlalchemy-exasol for details on how to connect to the database using Exasol SQLAlchemy driver." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9193a95-2d8c-4722-ab22-10b75d2c5253", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "4dfc8b6c-ace2-4f93-8bd1-f0a8ff496413", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "The first method requires the database machine to have internet access. Here we assume this condition is met. Otherwise please refer to another notebook where the second method is demonstrated.\n", + "\n", + "To demonstrate the token classification task we will use an [English Named Entity Recognition model](https://huggingface.co/sschet/biomedical-ner-all), trained on Maccrobat to recognize the bio-medical entities (107 entities) from a given text corpus (case reports etc.).\n", + "\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string.\n", + "\n", + "Please note that loading a model, especially a big one, may take considerable time. At the time of writing we do not have any means to check the completion of this process. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "26710f2d-a245-43e8-9ff2-292a60dbb19a", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'sschet/biomedical-ner-all'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34747971-46a0-4774-93c0-40cea26706fa", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT TE_MODEL_DOWNLOADER_UDF(\n", + " '{{MODEL_NAME}}',\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{conf.BFS_CONN}}',\n", + " ''\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "59ec8aee-e40f-44e8-b94b-49c374b923f4", + "metadata": {}, + "outputs": [], + "source": [ + "# We will display all model output\n", + "%config SqlMagic.displaylimit = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cc5b5ab6-4481-4b95-bcaa-86ad4da0d6e4", + "metadata": {}, + "outputs": [], + "source": [ + "MY_TEXT = \"\"\"\n", + "A 63-year-old woman with no known cardiac history presented with a sudden onset of dyspnea requiring\n", + "intubation and ventilatory support out of hospital. She denied preceding symptoms of chest discomfort,\n", + "palpitations, syncope or infection. 
The patient was afebrile and normotensive, with a sinus tachycardia\n", + "of 140 beats/min.\n", + "\"\"\"\n", + "\n", + "# Make sure our texts can be used in an SQL statement.\n", + "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40242348-7e1d-4a12-94c2-4d185eaa5d40", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_TOKEN_CLASSIFICATION_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{MY_TEXT}}',\n", + " NULL\n", + " )\n", + ")\n", + "SELECT start_pos, end_pos, word, entity, error_message FROM MODEL_OUTPUT ORDER BY start_pos, end_pos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "551c7581-16a4-4992-b031-e4fffa09cc46", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/translation.ipynb b/doc/tutorials/transformer/translation.ipynb new file mode 100644 index 00000000..fcf120e0 --- /dev/null +++ b/doc/tutorials/transformer/translation.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1e0418df-fcce-493b-aa45-ee4e339e51f5", + "metadata": {}, + "source": [ + "In this notebook we will load and use a language translation model. Learn more about the Translation task here.\n", + "\n", + "We will be running SQL queries using JupySQL SQL Magic.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. 
[Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "91b9f7ee-524a-428b-aad8-ee3d4a009940", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " 
{self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " \"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + " @property\n", + " def WEBSOCKET_URL(self):\n", + " return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "bd2d0acf-fd51-4b61-aeb5-e72b1f4be101", + "metadata": {}, + "source": [ + "First let's bring up the JupySQL and connect to the database via the SQLAlchemy. \n", + "Please refer to the documentation in the [sqlalchemy-exasol](https://github.com/exasol/sqlalchemy-exasol) for details on how to connect to the database using Exasol SQLAlchemy driver." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48b6a5bb-a1a6-4945-99a8-21fbdbc39392", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "7dd67dae-b65e-4d57-8a98-2baa07c18c9a", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", + "\n", + "In this notebook we will use the second method.\n", + "\n", + "To demonstrate the translation task we will use [Facebook WMT19 English-German model](https://huggingface.co/facebook/wmt19-en-de).\n", + "\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "db9dd34a-ff4e-44dc-9990-dd50f4be06d6", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'facebook/wmt19-en-de'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50a8cc23-bba5-4497-8947-9adc8c83a76c", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "108df0f9-8404-4e6f-b7dc-5c31fa00799a", + "metadata": {}, + "source": [ + "Now we can upload the model into the BucketFS using a command line. 
Unfortunately we cannot tell exactly when this process has finished. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2b25faa6-c75f-45de-9931-ff03a1bcde2e", + "metadata": {}, + "outputs": [], + "source": [ + "upload_command = f\"\"\"python -m exasol_transformers_extension.upload_model \\\n", + " --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n", + " --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n", + " --bucketfs-port {conf.BUCKETFS_PORT} \\\n", + " --bucketfs-user {conf.BUCKETFS_USER} \\\n", + " --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n", + " --bucket {conf.BUCKETFS_BUCKET} \\\n", + " --path-in-bucket {conf.BFS_DIR} \\\n", + " --model-name {MODEL_NAME} \\\n", + " --sub-dir {conf.TE_MODELS_DIR} \\\n", + " --local-model-path {conf.TE_MODELS_CACHE_DIR}\n", + " \"\"\"\n", + "!{upload_command}" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7d96886e-d7ca-4b78-b192-b38bf37f0796", + "metadata": {}, + "outputs": [], + "source": [ + "MY_TEXT = 'We all live in a yellow submarine'\n", + "\n", + "# Make sure our text can be used in an SQL statement.\n", + "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ed978b7-e610-4989-8185-9f32e98c1649", + "metadata": {}, + "outputs": [], + "source": [ + "# BLOCKED BY THE ABSENCE OF THE sacremoses TOKENIZER IN THE SLC\n", + "%%sql\n", + "SELECT TE_TRANSLATION_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{MY_TEXT}}',\n", + " '',\n", + " '',\n", + " 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55077fac-070a-4ab8-8752-84c99410abb4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": 
{ + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/tutorials/transformer/zero_shot_classification.ipynb b/doc/tutorials/transformer/zero_shot_classification.ipynb new file mode 100644 index 00000000..699e0499 --- /dev/null +++ b/doc/tutorials/transformer/zero_shot_classification.ipynb @@ -0,0 +1,255 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50507999-7f6c-4ee2-9d21-657f43dbbee8", + "metadata": {}, + "source": [ + "In this notebook we will load and use a zero shot classification language model. Learn about the Zero Shot Classification task here.\n", + "\n", + "We will be running SQL queries using JupySQL SQL Magic.\n", + "\n", + "Prior to using this notebook one needs to complete the follow steps:\n", + "1. [Create the database schema](../setup_db.ipynb).\n", + "2. [Initialize the Transformer Extension](te_init.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "f5c3c1e2-cd6d-4fa7-b992-01b4099b0a72", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Move this to a separate configuration notebook. 
Here we just need to load this configuration from a store.\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class SandboxConfig:\n", + " EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", + " HOST_PORT = \"8888\"\n", + "\n", + " @property\n", + " def EXTERNAL_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", + "\n", + " USER = \"sys\"\n", + " PASSWORD = \"exasol\"\n", + " BUCKETFS_PORT = \"6666\"\n", + " BUCKETFS_USER = \"w\"\n", + " BUCKETFS_PASSWORD = \"write\"\n", + " BUCKETFS_USE_HTTPS = False\n", + " BUCKETFS_SERVICE = \"bfsdefault\"\n", + " BUCKETFS_BUCKET = \"default\"\n", + "\n", + " @property\n", + " def EXTERNAL_BUCKETFS_HOST(self):\n", + " return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", + "\n", + " @property\n", + " def BUCKETFS_URL_PREFIX(self):\n", + " return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", + "\n", + " @property\n", + " def BUCKETFS_PATH(self):\n", + " # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", + " return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", + "\n", + " SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", + " UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", + " UDF_RELEASE= \"20190116\"\n", + " UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", + " SCHEMA = \"IDA\"\n", + "\n", + " @property\n", + " def SCRIPT_LANGUAGES(self):\n", + " return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", + " {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", + "\n", + " @property\n", + " def connection_params(self):\n", + " return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", + "\n", + " @property\n", + " def params(self):\n", + " return {\n", + " 
\"script_languages\": self.SCRIPT_LANGUAGES,\n", + " \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", + " \"schema\": self.SCHEMA,\n", + " \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", + " \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", + " \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", + " \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", + " \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", + " \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", + " }\n", + "\n", + " # Name of the BucketFS connection\n", + " BFS_CONN = 'MyBFSConn'\n", + "\n", + " # Name of a sub-directory of the bucket root\n", + " BFS_DIR = 'my_storage'\n", + "\n", + " # We will store all models in this sub-directory at BucketFS\n", + " TE_MODELS_DIR = 'models'\n", + " \n", + " # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", + " TE_MODELS_CACHE_DIR = 'models_cache'\n", + "\n", + "conf = SandboxConfig()" + ] + }, + { + "cell_type": "markdown", + "id": "19c23f14-f56d-45d9-9b58-4c343a54e46c", + "metadata": {}, + "source": [ + "First let's bring up the JupySQL and connect to the database via the SQLAlchemy. Please refer to the documentation in the sqlalchemy-exasol for details on how to connect to the database using Exasol SQLAlchemy driver." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3874261b-9c57-48ef-a4e7-8a47af64fd15", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine\n", + "\n", + "engine = create_engine(conf.WEBSOCKET_URL)\n", + "\n", + "%load_ext sql\n", + "%sql engine" + ] + }, + { + "cell_type": "markdown", + "id": "af7aca6a-5479-41ec-936c-d2cac34b6b11", + "metadata": {}, + "source": [ + "Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", + "\n", + "There are two ways of doing this.\n", + "1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", + "2. 
Downloading a model to a local drive and subsequently uploading it into the BucketFS using a CLI.\n", + "\n", + "The first method requires the database machine to have internet access. Here we assume this condition is met. Otherwise please refer to another notebook where the second method is demonstrated.\n", + "\n", + "To demonstrate the zero shot classification task we will use the [DistilBERT base model](https://huggingface.co/typeform/distilbert-base-uncased-mnli).\n", + "\n", + "This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string.\n", + "\n", + "Please note that loading a model, especially a big one, may take considerable time. At the time of writing we do not have any means to check the completion of this process. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for a few moments after that, before querying the model." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "cc473cf7-957b-491c-8925-40e0341ab78a", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the name of the model at the Huggingface Hub\n", + "MODEL_NAME = 'typeform/distilbert-base-uncased-mnli'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1d74058-92c2-48ae-b745-5651e32a419a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT TE_MODEL_DOWNLOADER_UDF(\n", + "    '{{MODEL_NAME}}',\n", + "    '{{conf.TE_MODELS_DIR}}',\n", + "    '{{conf.BFS_CONN}}',\n", + "    ''\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "81271ed8-b61e-4aeb-ac9b-6a12d72de897", + "metadata": {}, + "outputs": [], + "source": [ + "# Text to be classified.\n", + "MY_TEXT = \"\"\"\n", + "A new model offers an explanation for how the Galilean satellites formed around the solar system’s largest world. 
\n", + "Konstantin Batygin did not set out to solve one of the solar system’s most puzzling mysteries when he went for a\n", + "run up a hill in Nice, France. Dr. Batygin, a Caltech researcher, best known for his contributions to the search\n", + "for the solar system’s missing “Planet Nine,” spotted a beer bottle. At a steep, 20 degree grade, he wondered why\n", + "it wasn’t rolling down the hill. He realized there was a breeze at his back holding the bottle in place. Then he\n", + "had a thought that would only pop into the mind of a theoretical astrophysicist: “Oh! This is how Europa formed.”\n", + "Europa is one of Jupiter’s four large Galilean moons. And in a paper published Monday in the Astrophysical Journal,\n", + "Dr. Batygin and a co-author, Alessandro Morbidelli, a planetary scientist at the Côte d’Azur Observatory in France,\n", + "present a theory explaining how some moons form around gas giants like Jupiter and Saturn, suggesting that\n", + "millimeter-sized grains of hail produced during the solar system’s formation became trapped around these massive\n", + "worlds, taking shape one at a time into the potentially habitable moons we know today.\n", + "\"\"\"\n", + "\n", + "# Make sure our texts can be used in an SQL statement.\n", + "MY_TEXT = MY_TEXT.replace(\"'\", \"''\")\n", + "\n", + "# Classes, not seen during model training.\n", + "MY_LABELS = 'space & cosmos, scientific discovery, microbiology, robots, archeology'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71cabf49-3e4e-4745-bb62-e76e5adeac0b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "WITH MODEL_OUTPUT AS\n", + "(\n", + " SELECT TE_ZERO_SHOT_TEXT_CLASSIFICATION_UDF(\n", + " NULL,\n", + " '{{conf.BFS_CONN}}',\n", + " NULL,\n", + " '{{conf.TE_MODELS_DIR}}',\n", + " '{{MODEL_NAME}}',\n", + " '{{MY_TEXT}}',\n", + " '{{MY_LABELS}}'\n", + " )\n", + ")\n", + "SELECT label, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "cb15a19a-e518-4089-8dff-6d56be6867f2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}