From aadf5b4acc32ddbedd615f2958636027aeb6bcb6 Mon Sep 17 00:00:00 2001 From: Ilya Matiach Date: Fri, 22 Sep 2023 01:03:08 -0400 Subject: [PATCH] add RAI Text Insights question answering notebook on squad dataset --- ...ashboard-text-question-answering-squad.yml | 77 ++ sdk/python/README.md | 1 + ...hboard-text-question-answering-squad.ipynb | 993 ++++++++++++++++++ 3 files changed, 1071 insertions(+) create mode 100644 .github/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml create mode 100644 sdk/python/responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb diff --git a/.github/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml b/.github/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml new file mode 100644 index 00000000000..eef9764eca0 --- /dev/null +++ b/.github/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml @@ -0,0 +1,77 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad +# This file is created by sdk/python/readme.py. +# Please do not edit directly. 
+on: + workflow_dispatch: + schedule: + - cron: "50 9/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/responsible-ai/text/** + - .github/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: pip install mlflow reqs + run: pip install -r sdk/python/mlflow-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config 
"../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "responsibleaidashboard-text-question-answering-squad.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python responsibleaidashboard-text-question-answering-squad.ipynb responsibleaidashboard-text-question-answering-squad.output.ipynb + working-directory: sdk/python/responsible-ai/text + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: responsibleaidashboard-text-question-answering-squad + path: sdk/python/responsible-ai/text diff --git a/sdk/python/README.md b/sdk/python/README.md index 5384807950e..3b0e2cfc1ee 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -226,6 +226,7 @@ Test Status is for branch - **_main_** |responsible-ai|text|[responsibleaidashboard-multilabel-text-classification-covid-events](responsible-ai/text/responsibleaidashboard-multilabel-text-classification-covid-events.ipynb)|*no description*|[![responsibleaidashboard-multilabel-text-classification-covid-events](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-multilabel-text-classification-covid-events.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-multilabel-text-classification-covid-events.yml)| |responsible-ai|text|[responsibleaidashboard-text-classification-DBPedia](responsible-ai/text/responsibleaidashboard-text-classification-DBPedia.ipynb)|*no 
description*|[![responsibleaidashboard-text-classification-DBPedia](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-classification-DBPedia.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-classification-DBPedia.yml)| |responsible-ai|text|[responsibleaidashboard-text-classification-blbooksgenre](responsible-ai/text/responsibleaidashboard-text-classification-blbooksgenre.ipynb)|*no description*|[![responsibleaidashboard-text-classification-blbooksgenre](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-classification-blbooksgenre.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-classification-blbooksgenre.yml)| +|responsible-ai|text|[responsibleaidashboard-text-question-answering-squad](responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb)|*no description*|[![responsibleaidashboard-text-question-answering-squad](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-text-responsibleaidashboard-text-question-answering-squad.yml)| |responsible-ai|vision|[responsibleaidashboard-automl-image-classification-fridge](responsible-ai/vision/responsibleaidashboard-automl-image-classification-fridge.ipynb)|*no 
description*|[![responsibleaidashboard-automl-image-classification-fridge](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-automl-image-classification-fridge.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-automl-image-classification-fridge.yml)| |responsible-ai|vision|[responsibleaidashboard-image-classification-fridge](responsible-ai/vision/responsibleaidashboard-image-classification-fridge.ipynb)|*no description*|[![responsibleaidashboard-image-classification-fridge](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-image-classification-fridge.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-image-classification-fridge.yml)| |responsible-ai|vision|[responsibleaidashboard-image-multilabel-classification-fridge](responsible-ai/vision/responsibleaidashboard-image-multilabel-classification-fridge.ipynb)|*no description*|[![responsibleaidashboard-image-multilabel-classification-fridge](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-image-multilabel-classification-fridge.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-responsible-ai-vision-responsibleaidashboard-image-multilabel-classification-fridge.yml)| diff --git a/sdk/python/responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb b/sdk/python/responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb new file mode 100644 index 00000000000..4f267f4f71b --- /dev/null +++ b/sdk/python/responsible-ai/text/responsibleaidashboard-text-question-answering-squad.ipynb @@ -0,0 +1,993 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "98605bcd", + 
"metadata": {}, + "source": [ + "# Text Question Answering scenario with RAI Dashboard\n", + "\n", + "The [Stanford Question Answering Dataset (SQuAD)](https://huggingface.co/datasets/squad) is a reading comprehension dataset on a set of Wikipeda articles. This notebook examines a huggingface question answering model evaluated on the dataset." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "80399184", + "metadata": {}, + "source": [ + "Install datasets to retrieve this dataset from huggingface:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0bc583b", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install datasets\n", + "%pip install \"pandas<2.0.0\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bdd9bfba", + "metadata": {}, + "source": [ + "First, we need to specify the version of the RAI components which are available in the workspace. This was specified when the components were uploaded." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53b4eeac", + "metadata": {}, + "outputs": [], + "source": [ + "version_string = \"0.0.10\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "06008690", + "metadata": {}, + "source": [ + "We also need to give the name of the compute cluster we want to use in AzureML. Later in this notebook, we will create it if it does not already exist:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1ad79f9", + "metadata": {}, + "outputs": [], + "source": [ + "compute_name = \"cpucluster\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9fc65dc7", + "metadata": {}, + "source": [ + "Finally, we need to specify a version for the data and components we will create while running this notebook. 
This should be unique for the workspace, but the specific value doesn't matter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78053935", + "metadata": {}, + "outputs": [], + "source": [ + "rai_example_version_string = \"1\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "73be2b63", + "metadata": {}, + "source": [ + "## Accessing the Data\n", + "\n", + "We supply the data as a pair of parquet files and accompanying `MLTable` file. We can download them, preprocess them, and take a brief look:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f875f18", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import datasets\n", + "import pandas as pd\n", + "\n", + "from sklearn import preprocessing\n", + "\n", + "NUM_TEST_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccbfd923", + "metadata": {}, + "outputs": [], + "source": [ + "def load_dataset(split):\n", + " dataset = datasets.load_dataset(\"squad\", split=split)\n", + " questions = []\n", + " context = []\n", + " answers = []\n", + " for row in dataset:\n", + " context.append(row[\"context\"])\n", + " questions.append(row[\"question\"])\n", + " answers.append(row[\"answers\"][\"text\"][0])\n", + " return pd.DataFrame(\n", + " {\"context\": context, \"questions\": questions, \"answers\": answers}\n", + " )\n", + "\n", + "\n", + "pd_test_data = load_dataset(\"train\")\n", + "\n", + "test_data = pd_test_data[:NUM_TEST_SAMPLES]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "671d286f", + "metadata": {}, + "outputs": [], + "source": [ + "test_data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "17d53df4", + "metadata": {}, + "source": [ + "Now create the mltable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7bbe58", + "metadata": {}, + "outputs": [], + "source": [ + "pq_filename = \"hf_data.parquet\"\n", + "\n", + 
"\n", + "def create_ml_table_file_contents(pq_filename):\n", + " return (\n", + " \"$schema: http://azureml/sdk-2-0/MLTable.json\\n\"\n", + " \"type: mltable\\n\"\n", + " \"paths:\\n\"\n", + " \" - file: ./{0}\\n\"\n", + " \"transformations:\\n\"\n", + " \" - read_parquet\\n\"\n", + " ).format(pq_filename)\n", + "\n", + "\n", + "def write_to_parquet(data, path, pq_filename):\n", + " os.makedirs(path, exist_ok=True)\n", + " data.to_parquet(os.path.join(path, pq_filename), index=False)\n", + "\n", + "\n", + "def create_ml_table_file(path, contents):\n", + " with open(os.path.join(path, \"MLTable\"), \"w\") as f:\n", + " f.write(contents)\n", + "\n", + "\n", + "test_data_path = \"test_data\"\n", + "\n", + "write_to_parquet(test_data, test_data_path, pq_filename)\n", + "\n", + "mltable_file_contents = create_ml_table_file_contents(pq_filename)\n", + "create_ml_table_file(test_data_path, mltable_file_contents)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a2c4ebb4", + "metadata": {}, + "source": [ + "Load some data for a quick view:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1027fa92", + "metadata": {}, + "outputs": [], + "source": [ + "import mltable\n", + "\n", + "tbl = mltable.load(test_data_path)\n", + "test_df: pd.DataFrame = tbl.to_pandas_dataframe()\n", + "\n", + "display(test_df)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1115ac59", + "metadata": {}, + "source": [ + "The label column contains the answers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b42df3d", + "metadata": {}, + "outputs": [], + "source": [ + "target_column_name = \"answers\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52e79b04", + "metadata": {}, + "source": [ + "First, we need to upload the datasets to our workspace. 
We start by creating an `MLClient` for interactions with AzureML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4570d606", + "metadata": {}, + "outputs": [], + "source": [ + "# Enter details of your AML workspace\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "395435fc", + "metadata": {}, + "outputs": [], + "source": [ + "# Handle to the workspace\n", + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " ml_client = MLClient(\n", + " credential=credential,\n", + " subscription_id=subscription_id,\n", + " resource_group_name=resource_group,\n", + " workspace_name=workspace,\n", + " )\n", + "except Exception:\n", + " # If in compute instance we can get the config automatically\n", + " from azureml.core import Workspace\n", + "\n", + " workspace = Workspace.from_config()\n", + " workspace.write_config()\n", + " ml_client = MLClient.from_config(\n", + " credential=DefaultAzureCredential(exclude_shared_token_cache_credential=True),\n", + " logging_enable=True,\n", + " )\n", + "\n", + "print(ml_client)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7b501735", + "metadata": {}, + "source": [ + "We can now upload the data to AzureML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62eb02a2", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import Data\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "input_test_data = \"Squad_Test_MLTable\"\n", + "\n", + "try:\n", + " test_data = ml_client.data.get(\n", + " name=input_test_data,\n", + " version=rai_example_version_string,\n", + " )\n", + "except Exception:\n", + " test_data = Data(\n", + " path=test_data_path,\n", + " type=AssetTypes.MLTABLE,\n", + " description=\"RAI Squad test 
data\",\n", + " name=input_test_data,\n", + " version=rai_example_version_string,\n", + " )\n", + " ml_client.data.create_or_update(test_data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6815ba75", + "metadata": {}, + "source": [ + "# Creating the Model\n", + "\n", + "To simplify the model creation process, we're going to use a pipeline.\n", + "\n", + "We create a directory for the training script:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e78d869b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.makedirs(\"squad_component_src\", exist_ok=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ea86e55d", + "metadata": {}, + "source": [ + "Next, we write out our script to retrieve the huggingface question answering model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a523f144", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile squad_component_src/training_script.py\n", + "\n", + "import argparse\n", + "import logging\n", + "import json\n", + "import os\n", + "import time\n", + "\n", + "import mlflow\n", + "import mlflow.pyfunc\n", + "\n", + "from transformers import pipeline\n", + "\n", + "from azureml.core import Run\n", + "\n", + "from azureml.rai.utils import PyfuncModel\n", + "\n", + "\n", + "_logger = logging.getLogger(__file__)\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "def parse_args():\n", + " # setup arg parser\n", + " parser = argparse.ArgumentParser()\n", + "\n", + " # add arguments\n", + " parser.add_argument(\n", + " \"--model_output_path\", type=str, help=\"Path to write model info JSON\"\n", + " )\n", + " parser.add_argument(\n", + " \"--model_base_name\", type=str, help=\"Name of the registered model\"\n", + " )\n", + " parser.add_argument(\n", + " \"--model_name_suffix\", type=int, help=\"Set negative to use epoch_secs\"\n", + " )\n", + "\n", + " # parse args\n", + " args = 
parser.parse_args()\n", + "\n", + " # return args\n", + " return args\n", + "\n", + "\n", + "def main(args):\n", + " current_experiment = Run.get_context().experiment\n", + " tracking_uri = current_experiment.workspace.get_mlflow_tracking_uri()\n", + " _logger.info(\"tracking_uri: {0}\".format(tracking_uri))\n", + " mlflow.set_tracking_uri(tracking_uri)\n", + " mlflow.set_experiment(current_experiment.name)\n", + "\n", + " # load the question-answering model\n", + " pmodel = pipeline('question-answering')\n", + "\n", + " if args.model_name_suffix < 0:\n", + " suffix = int(time.time())\n", + " else:\n", + " suffix = args.model_name_suffix\n", + " registered_name = \"{0}_{1}\".format(args.model_base_name, suffix)\n", + " _logger.info(f\"Registering model as {registered_name}\")\n", + "\n", + " my_mlflow = PyfuncModel(pmodel)\n", + "\n", + " # Saving model with mlflow\n", + " _logger.info(\"Saving with mlflow\")\n", + " mlflow.pyfunc.log_model(\n", + " python_model=my_mlflow,\n", + " registered_model_name=registered_name,\n", + " artifact_path=registered_name,\n", + " )\n", + "\n", + " _logger.info(\"Writing JSON\")\n", + " dict = {\"id\": \"{0}:1\".format(registered_name)}\n", + " output_path = os.path.join(args.model_output_path, \"model_info.json\")\n", + " with open(output_path, \"w\") as of:\n", + " json.dump(dict, fp=of)\n", + "\n", + "\n", + "# run script\n", + "if __name__ == \"__main__\":\n", + " # add space in logs\n", + " print(\"*\" * 60)\n", + " print(\"\\n\\n\")\n", + "\n", + " # parse args\n", + " args = parse_args()\n", + "\n", + " # run main function\n", + " main(args)\n", + "\n", + " # add space in logs\n", + " print(\"*\" * 60)\n", + " print(\"\\n\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e115dd6e", + "metadata": {}, + "source": [ + "Now, we can build this into an AzureML component:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d54e43f", + "metadata": {}, + "outputs": [], + "source": [ + 
"from azure.ai.ml import load_component\n", + "\n", + "yaml_contents = f\"\"\"\n", + "$schema: http://azureml/sdk-2-0/CommandComponent.json\n", + "name: rai_training_component\n", + "display_name: Squad training component for RAI example\n", + "version: {rai_example_version_string}\n", + "type: command\n", + "inputs:\n", + " model_base_name:\n", + " type: string\n", + " model_name_suffix: # Set negative to use epoch_secs\n", + " type: integer\n", + " default: -1\n", + "outputs:\n", + " model_output_path:\n", + " type: path\n", + "code: ./squad_component_src/\n", + "environment: azureml://registries/azureml/environments/responsibleai-text-ubuntu20.04-py38-cpu/versions/30\n", + "command: >-\n", + " python training_script.py\n", + " --model_base_name ${{{{inputs.model_base_name}}}}\n", + " --model_name_suffix ${{{{inputs.model_name_suffix}}}}\n", + " --model_output_path ${{{{outputs.model_output_path}}}}\n", + "\"\"\"\n", + "\n", + "yaml_filename = \"SquadTextTrainingComp.yaml\"\n", + "\n", + "with open(yaml_filename, \"w\") as f:\n", + " f.write(yaml_contents)\n", + "\n", + "train_component_definition = load_component(source=yaml_filename)\n", + "\n", + "ml_client.components.create_or_update(train_component_definition)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6d165e2b", + "metadata": {}, + "source": [ + "We need a compute target on which to run our jobs. The following checks whether the compute specified above is present; if not, then the compute target is created." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e40fc38", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import AmlCompute\n", + "\n", + "all_compute_names = [x.name for x in ml_client.compute.list()]\n", + "\n", + "if compute_name in all_compute_names:\n", + " print(f\"Found existing compute: {compute_name}\")\n", + "else:\n", + " my_compute = AmlCompute(\n", + " name=compute_name,\n", + " size=\"STANDARD_DS3_V2\",\n", + " min_instances=0,\n", + " max_instances=4,\n", + " idle_time_before_scale_down=3600,\n", + " )\n", + " ml_client.compute.begin_create_or_update(my_compute)\n", + " print(\"Initiated compute creation\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9d8eb868", + "metadata": {}, + "source": [ + "## Running a training pipeline\n", + "\n", + "Now that we have our training component, we can run it. We begin by generating a unique name for the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad76242b", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "model_base_name = \"hf_qa_model\"\n", + "model_name_suffix = \"12455\"\n", + "device = -1" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d49615a7", + "metadata": {}, + "source": [ + "Next, we define our training pipeline. This has two components. The first is the training component which we defined above. 
The second is a component to register the model in AzureML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb6c6cec", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import dsl, Input\n", + "\n", + "train_model_component = ml_client.components.get(\n", + " name=\"rai_training_component\", version=rai_example_version_string\n", + ")\n", + "\n", + "\n", + "@dsl.pipeline(\n", + " compute=compute_name,\n", + " description=\"Register Model for RAI Squad example\",\n", + " experiment_name=f\"RAI_Squad_Example_Model_Training_{model_name_suffix}\",\n", + ")\n", + "def my_training_pipeline(model_base_name, model_name_suffix, device):\n", + " trained_model = train_component_definition(\n", + " model_base_name=model_base_name,\n", + " model_name_suffix=model_name_suffix,\n", + " )\n", + " trained_model.set_limits(timeout=3600)\n", + "\n", + " return {}\n", + "\n", + "\n", + "model_registration_pipeline_job = my_training_pipeline(\n", + " model_base_name, model_name_suffix, device\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2fa66ea6", + "metadata": {}, + "source": [ + "With the training pipeline defined, we can submit it for execution in AzureML. 
We define a helper function to wait for the job to complete:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f854eef5", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import PipelineJob\n", + "\n", + "\n", + "def submit_and_wait(ml_client, pipeline_job) -> PipelineJob:\n", + " created_job = ml_client.jobs.create_or_update(pipeline_job)\n", + " assert created_job is not None\n", + "\n", + " while created_job.status not in [\n", + " \"Completed\",\n", + " \"Failed\",\n", + " \"Canceled\",\n", + " \"NotResponding\",\n", + " ]:\n", + " time.sleep(30)\n", + " created_job = ml_client.jobs.get(created_job.name)\n", + " print(\"Latest status : {0}\".format(created_job.status))\n", + " assert created_job.status == \"Completed\"\n", + " return created_job\n", + "\n", + "\n", + "# This is the actual submission\n", + "training_job = submit_and_wait(ml_client, model_registration_pipeline_job)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0722395e", + "metadata": {}, + "source": [ + "## Creating the RAI Text Insights\n", + "\n", + "Now that we have our model, we can generate RAI Text insights for it. 
We will need the `id` of the registered model, which will be as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d3e6e6e", + "metadata": {}, + "outputs": [], + "source": [ + "expected_model_id = f\"{model_base_name}_{model_name_suffix}:1\"\n", + "azureml_model_id = f\"azureml:{expected_model_id}\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "310aa659", + "metadata": {}, + "source": [ + "Next, we load the RAI components, so that we can construct a pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d67b942e", + "metadata": {}, + "outputs": [], + "source": [ + "squad_test_mltable = Input(\n", + " type=\"mltable\",\n", + " path=f\"{input_test_data}:{rai_example_version_string}\",\n", + " mode=\"download\",\n", + ")\n", + "\n", + "registry_name = \"azureml\"\n", + "credential = DefaultAzureCredential()\n", + "\n", + "ml_client_registry = MLClient(\n", + " credential=credential,\n", + " subscription_id=ml_client.subscription_id,\n", + " resource_group_name=ml_client.resource_group_name,\n", + " registry_name=registry_name,\n", + ")\n", + "\n", + "rai_text_insights_component = ml_client_registry.components.get(\n", + " name=\"rai_text_insights\", version=version_string\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c98cd2d9", + "metadata": {}, + "source": [ + "We can now specify our pipeline. Complex objects (such as lists of column names) have to be converted to JSON strings before being passed to the components." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a62105a7", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "\n", + "@dsl.pipeline(\n", + " compute=compute_name,\n", + " description=\"Example RAI computation on Squad data\",\n", + " experiment_name=f\"RAI_Squad_Example_RAIInsights_Computation_{model_name_suffix}\",\n", + ")\n", + "def rai_squad_question_answering_pipeline(\n", + " target_column_name,\n", + " test_data,\n", + " classes,\n", + " use_model_dependency,\n", + "):\n", + " # Initiate the RAIInsights\n", + " rai_text_job = rai_text_insights_component(\n", + " task_type=\"question_answering\",\n", + " model_info=expected_model_id,\n", + " model_input=Input(type=AssetTypes.MLFLOW_MODEL, path=azureml_model_id),\n", + " test_dataset=test_data,\n", + " target_column_name=target_column_name,\n", + " classes=classes,\n", + " use_model_dependency=use_model_dependency,\n", + " )\n", + " rai_text_job.set_limits(timeout=7200)\n", + "\n", + " rai_text_job.outputs.dashboard.mode = \"upload\"\n", + " rai_text_job.outputs.ux_json.mode = \"upload\"\n", + "\n", + " return {\n", + " \"dashboard\": rai_text_job.outputs.dashboard,\n", + " \"ux_json\": rai_text_job.outputs.ux_json,\n", + " }" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6b5b14a9", + "metadata": {}, + "source": [ + "Next, we define the pipeline object itself, and ensure that the outputs will be available for download:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4d86ec2", + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "from azure.ai.ml import Output\n", + "\n", + "insights_pipeline_job = rai_squad_question_answering_pipeline(\n", + " target_column_name=target_column_name,\n", + " test_data=squad_test_mltable,\n", + " use_model_dependency=True,\n", + " classes=\"[]\",\n", + ")\n", + "\n", + 
"rand_path = str(uuid.uuid4())\n", + "insights_pipeline_job.outputs.dashboard = Output(\n", + " path=f\"azureml://datastores/workspaceblobstore/paths/{rand_path}/dashboard/\",\n", + " mode=\"upload\",\n", + " type=\"uri_folder\",\n", + ")\n", + "insights_pipeline_job.outputs.ux_json = Output(\n", + " path=f\"azureml://datastores/workspaceblobstore/paths/{rand_path}/ux_json/\",\n", + " mode=\"upload\",\n", + " type=\"uri_folder\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "25f34573", + "metadata": {}, + "source": [ + "And submit the pipeline to AzureML for execution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ca757f7", + "metadata": {}, + "outputs": [], + "source": [ + "insights_job = submit_and_wait(ml_client, insights_pipeline_job)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1381768a", + "metadata": {}, + "source": [ + "The dashboard should appear in the AzureML portal in the registered model view. The following cell computes the expected URI:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e86ab611", + "metadata": {}, + "outputs": [], + "source": [ + "sub_id = ml_client._operation_scope.subscription_id\n", + "rg_name = ml_client._operation_scope.resource_group_name\n", + "ws_name = ml_client.workspace_name\n", + "\n", + "expected_uri = f\"https://ml.azure.com/model/{expected_model_id}/model_analysis?wsid=/subscriptions/{sub_id}/resourcegroups/{rg_name}/workspaces/{ws_name}\"\n", + "\n", + "print(f\"Please visit {expected_uri} to see your analysis\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "93a8dff9", + "metadata": {}, + "source": [ + "## Constructing the pipeline in YAML\n", + "\n", + "It is also possible to specify the pipeline as a YAML file, and submit that using the command line. 
We will now create a YAML specification of the above pipeline and submit that:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "624bb0cd", + "metadata": {}, + "outputs": [], + "source": [ + "yaml_contents = f\"\"\"\n", + "$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json\n", + "experiment_name: RAI_Squad_Example_RAIInsights_Computation_{rai_example_version_string}\n", + "type: pipeline\n", + "\n", + "compute: azureml:cpucluster\n", + "\n", + "inputs:\n", + " registered_model_name: hf_qa_model\n", + " hf_model_info: {expected_model_id}\n", + " my_test_data:\n", + " type: mltable\n", + " path: azureml:{input_test_data}:{rai_example_version_string}\n", + " mode: download\n", + "\n", + "settings:\n", + " default_datastore: azureml:workspaceblobstore\n", + " default_compute: azureml:cpucluster\n", + " continue_on_step_failure: false\n", + "\n", + "jobs:\n", + " analyse_model:\n", + " type: command\n", + " component: azureml://registries/azureml/components/rai_text_insights/versions/{version_string}\n", + " inputs:\n", + " task_type: question_answering\n", + " model_input:\n", + " type: mlflow_model\n", + " path: {azureml_model_id}\n", + " model_info: ${{{{parent.inputs.hf_model_info}}}}\n", + " test_dataset: ${{{{parent.inputs.my_test_data}}}}\n", + " target_column_name: {target_column_name}\n", + " maximum_rows_for_test_dataset: 5000\n", + " classes: '[]'\n", + " enable_explanation: True\n", + " enable_error_analysis: True\n", + "\"\"\"\n", + "\n", + "yaml_pipeline_filename = \"rai_text_example.yaml\"\n", + "\n", + "with open(yaml_pipeline_filename, \"w\") as f:\n", + " f.write(yaml_contents)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1fd5f2dd", + "metadata": {}, + "source": [ + "The created file can then be submitted using the Azure CLI:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bf9bb1c", + "metadata": {}, + "outputs": [], + "source": [ + "cmd_line = [\n", + 
" \"az\",\n", + " \"ml\",\n", + " \"job\",\n", + " \"create\",\n", + " \"--resource-group\",\n", + " rg_name,\n", + " \"--workspace\",\n", + " ws_name,\n", + " \"--file\",\n", + " yaml_pipeline_filename,\n", + "]\n", + "\n", + "import subprocess\n", + "\n", + "try:\n", + " cmd = subprocess.run(cmd_line, check=True, shell=True, capture_output=True)\n", + "except subprocess.CalledProcessError as cpe:\n", + " print(f\"Error invoking: {cpe.args}\")\n", + " print(cpe.stdout)\n", + " print(cpe.stderr)\n", + " raise\n", + "else:\n", + " print(\"Azure CLI submission completed\")" + ] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + }, + "vscode": { + "interpreter": { + "hash": "8fd340b5477ca1a0b454d48a3973beff39fee032ada47a04f6f3725b469a8988" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}