-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial set of transformer notebooks
- Loading branch information
Showing
8 changed files
with
2,168 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,277 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b25130b6-6e01-481a-91a0-838198be3ea8", | ||
"metadata": {}, | ||
"source": [ | ||
"In this notebook we will load and use a masked language model. This kind of a model predicts which words would replace masked words in a sentence. Learn more about the Fill-Mask task <a href=\"https://huggingface.co/tasks/fill-mask\" target=\"_blank\" rel=\"noopener\">here</a>.\n", | ||
"\n", | ||
"We will be running SQL queries using <a href=\"https://jupysql.ploomber.io/en/latest/quick-start.html\" target=\"_blank\" rel=\"noopener\"> JupySQL</a> SQL Magic.\n", | ||
"\n", | ||
"Prior to using this notebook one needs to complete the follow steps:\n", | ||
"1. [Create the database schema](../setup_db.ipynb).\n", | ||
"2. [Initialize the Transformer Extension](te_init.ipynb)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "4884f64d-aee2-4248-a922-8d28cf70209f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n", | ||
"from dataclasses import dataclass\n", | ||
"\n", | ||
"@dataclass\n", | ||
"class SandboxConfig:\n", | ||
" EXTERNAL_HOST_NAME = \"192.168.124.93\"\n", | ||
" HOST_PORT = \"8888\"\n", | ||
"\n", | ||
" @property\n", | ||
" def EXTERNAL_HOST(self):\n", | ||
" return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n", | ||
"\n", | ||
" USER = \"sys\"\n", | ||
" PASSWORD = \"exasol\"\n", | ||
" BUCKETFS_PORT = \"6666\"\n", | ||
" BUCKETFS_USER = \"w\"\n", | ||
" BUCKETFS_PASSWORD = \"write\"\n", | ||
" BUCKETFS_USE_HTTPS = False\n", | ||
" BUCKETFS_SERVICE = \"bfsdefault\"\n", | ||
" BUCKETFS_BUCKET = \"default\"\n", | ||
"\n", | ||
" @property\n", | ||
" def EXTERNAL_BUCKETFS_HOST(self):\n", | ||
" return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n", | ||
"\n", | ||
" @property\n", | ||
" def BUCKETFS_URL_PREFIX(self):\n", | ||
" return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n", | ||
"\n", | ||
" @property\n", | ||
" def BUCKETFS_PATH(self):\n", | ||
" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n", | ||
" return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n", | ||
"\n", | ||
" SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n", | ||
" UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n", | ||
" UDF_RELEASE= \"20190116\"\n", | ||
" UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n", | ||
" SCHEMA = \"IDA\"\n", | ||
"\n", | ||
" @property\n", | ||
" def SCRIPT_LANGUAGES(self):\n", | ||
" return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n", | ||
" {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n", | ||
" {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n", | ||
"\n", | ||
" @property\n", | ||
" def connection_params(self):\n", | ||
" return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n", | ||
"\n", | ||
" @property\n", | ||
" def params(self):\n", | ||
" return {\n", | ||
" \"script_languages\": self.SCRIPT_LANGUAGES,\n", | ||
" \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n", | ||
" \"schema\": self.SCHEMA,\n", | ||
" \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n", | ||
" \"BUCKETFS_USER\": self.BUCKETFS_USER,\n", | ||
" \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n", | ||
" \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n", | ||
" \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n", | ||
" \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n", | ||
" }\n", | ||
"\n", | ||
" # Name of the BucketFS connection\n", | ||
" BFS_CONN = 'MyBFSConn'\n", | ||
"\n", | ||
" # Name of a sub-directory of the bucket root\n", | ||
" BFS_DIR = 'my_storage'\n", | ||
"\n", | ||
" # We will store all models in this sub-directory at BucketFS\n", | ||
" TE_MODELS_DIR = 'models'\n", | ||
" \n", | ||
" # We will save cached model in this sub-directory relative to the current directory on the local machine.\n", | ||
" TE_MODELS_CACHE_DIR = 'models_cache'\n", | ||
"\n", | ||
" @property\n", | ||
" def WEBSOCKET_URL(self):\n", | ||
" return f\"exa+websocket://{self.USER}:{self.PASSWORD}@{self.EXTERNAL_HOST}/{self.SCHEMA}?SSLCertificate=SSL_VERIFY_NONE\"\n", | ||
"\n", | ||
"conf = SandboxConfig()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "ff882542-d473-4767-a035-5c2615080cae", | ||
"metadata": {}, | ||
"source": [ | ||
"First let's bring up the JupySQL and connect to the database via the SQLAlchemy. Please refer to the documentation in the <a href=\"https://github.com/exasol/sqlalchemy-exasol\" target=\"_blank\" rel=\"noopener\">sqlalchemy-exasol</a> for details on how to connect to the database using Exasol SQLAlchemy driver." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "f0cc45d2-d5ae-4afc-9f1d-251923995990", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from sqlalchemy import create_engine\n", | ||
"\n", | ||
"engine = create_engine(conf.WEBSOCKET_URL)\n", | ||
"\n", | ||
"%load_ext sql\n", | ||
"%sql engine" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e0ad672a-8b26-467b-8649-0bf95b1efb61", | ||
"metadata": {}, | ||
"source": [ | ||
"Now we will download a model from the Huggingface Hub and put into the BucketFS.\n", | ||
"\n", | ||
"There are two ways of doing this.\n", | ||
"1. Using the `TE_MODEL_DOWNLOADER_UDF` UDF.\n", | ||
"2. Downloading a model to a local drive and subsequently uploading in into the BucketFS using a CLI.\n", | ||
"\n", | ||
"In this notebook we will use the second method.\n", | ||
"\n", | ||
"To demonstrate finding of a masked word task we will use a [RadBERT model](https://huggingface.co/StanfordAIMI/RadBERT) which was pre-trained on radiology reports. This is a public model, therefore the last parameter - the name of the Huggingface token connection - can be an empty string." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"id": "dede4beb-9bfe-413c-846a-a2e5c6eaa784", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# This is the name of the model at the Huggingface Hub\n", | ||
"MODEL_NAME = 'StanfordAIMI/RadBERT'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "82d4acb0-3440-4e5a-b3fa-fdd45e19cb31", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n", | ||
"\n", | ||
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)\n", | ||
"model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, cache_dir=conf.TE_MODELS_CACHE_DIR)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "2322e677-b4ce-46cd-a807-544afb42b4b4", | ||
"metadata": {}, | ||
"source": [ | ||
"Now we can upload the model into the BucketFS using a command line. Unfortunately we cannot tell exactly when this process has finished. Notebook's hourglass may not be a reliable indicator. BucketFS will still be doing some work when the call issued by the notebook returns. Please wait for few moments after that, before querying the model." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"id": "93b5d4ca-ad27-4421-bad3-3a98851db70d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"upload_command = f\"\"\"python -m exasol_transformers_extension.upload_model \\\n", | ||
" --bucketfs-name {conf.BUCKETFS_SERVICE} \\\n", | ||
" --bucketfs-host {conf.EXTERNAL_HOST_NAME} \\\n", | ||
" --bucketfs-port {conf.BUCKETFS_PORT} \\\n", | ||
" --bucketfs-user {conf.BUCKETFS_USER} \\\n", | ||
" --bucketfs-password {conf.BUCKETFS_PASSWORD} \\\n", | ||
" --bucket {conf.BUCKETFS_BUCKET} \\\n", | ||
" --path-in-bucket {conf.BFS_DIR} \\\n", | ||
" --model-name {MODEL_NAME} \\\n", | ||
" --sub-dir {conf.TE_MODELS_DIR} \\\n", | ||
" --local-model-path {conf.TE_MODELS_CACHE_DIR}\n", | ||
" \"\"\"\n", | ||
"!{upload_command}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "9825231e-f7fa-4011-9d11-f94890f6ba7d", | ||
"metadata": {}, | ||
"source": [ | ||
"Let's see if the model can find a masked word in an instruction usually given to a patient when radiographer is doing her chest X-ray." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "7c0ff55c-dada-4f5f-a923-8c86695723bf", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# This is a sentence with a masked word that will be given to the model.\n", | ||
"MY_TEXT = 'Take a deep [MASK] and hold it'\n", | ||
"\n", | ||
"# Make sure our text can be used in an SQL statement.\n", | ||
"MY_TEXT = MY_TEXT.replace(\"'\", \"''\")\n", | ||
"\n", | ||
"# We will collect 5 best answers." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0d8288e3-0658-44a3-8fc6-92d6e1c4b3f1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%sql\n", | ||
"WITH MODEL_OUTPUT AS\n", | ||
"(\n", | ||
" SELECT TE_FILLING_MASK_UDF(\n", | ||
" NULL,\n", | ||
" '{{conf.BFS_CONN}}',\n", | ||
" NULL,\n", | ||
" '{{conf.TE_MODELS_DIR}}',\n", | ||
" '{{MODEL_NAME}}',\n", | ||
" '{{MY_TEXT}}',\n", | ||
" 5\n", | ||
" )\n", | ||
")\n", | ||
"SELECT filled_text, score, error_message FROM MODEL_OUTPUT ORDER BY SCORE DESC" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6a164578-5d2a-40e2-8bdd-2800765fe5c7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.