From e9dc58a7c8f7363109f8ffda8ae4dc915dbb0675 Mon Sep 17 00:00:00 2001 From: kkrishTa <56536056+kkrishTa@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:27:09 +0530 Subject: [PATCH] Azure automated deployment for OPEA applications - Infosys (#629) * CI: dump all containers' log for chart test (#627) Signed-off-by: Lianhao Lu Signed-off-by: kkrishTa * AKS Deployment for OPEA applications Signed-off-by: kkrishTa * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: kkrishTa * Reverted changes by pre commit hook. Updated readme as per suggestions Signed-off-by: kkrishTa * Adding code spell ignore for Azure Kubernetes Service Signed-off-by: kkrishTa * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: kkrishTa * Adding code spell ignore for AKS and reverting pre commit changes Signed-off-by: kkrishTa * Updating cluster name to opea as a generic name Signed-off-by: kkrishTa * helm chart: Add service account support (#624) * helm: Add service account support in common services 1. Add service account creation support, disabled by default. 2. Add support of sharing the same service account by setting global.sharedSAName, disabled by default. Signed-off-by: Lianhao Lu * helm: Add service account support in e2e charts 1. Add service account creation support, enabled by default. 2. Add support of sharing the same service account by setting global.sharedSAName, enabled by default. 
Signed-off-by: Lianhao Lu --------- Signed-off-by: Lianhao Lu Signed-off-by: kkrishTa * README: add links to terraform docs (#633) * README: add links to terraform docs Signed-off-by: Sakari Poussa * README: fix broken links Signed-off-by: Sakari Poussa * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sakari Poussa Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: kkrishTa * Adding link to the Azure terraform readme to opea documentation Signed-off-by: kkrishTa * Corrected md file link Signed-off-by: kkrishTa * Updated relative links Signed-off-by: kkrishTa --------- Signed-off-by: Lianhao Lu Signed-off-by: kkrishTa Signed-off-by: Sakari Poussa Co-authored-by: Lianhao Lu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sakari Poussa --- .github/code_spell_ignore.txt | 2 + README.md | 7 +- .../azure/aks/terraform/README.md | 84 +++++++++++++ .../azure/aks/terraform/aks-azfs-csi-pvc.yaml | 14 +++ .../azure/aks/terraform/azure_main.tf | 113 ++++++++++++++++++ .../azure/aks/terraform/opea-chatqna.tfvars | 6 + .../azure/aks/terraform/outputs.tf | 21 ++++ .../azure/aks/terraform/terraform.tf | 18 +++ .../azure/aks/terraform/variables.tf | 83 +++++++++++++ 9 files changed, 345 insertions(+), 3 deletions(-) create mode 100644 cloud-service-provider/azure/aks/terraform/README.md create mode 100644 cloud-service-provider/azure/aks/terraform/aks-azfs-csi-pvc.yaml create mode 100644 cloud-service-provider/azure/aks/terraform/azure_main.tf create mode 100644 cloud-service-provider/azure/aks/terraform/opea-chatqna.tfvars create mode 100644 cloud-service-provider/azure/aks/terraform/outputs.tf create mode 100644 cloud-service-provider/azure/aks/terraform/terraform.tf create mode 100644 cloud-service-provider/azure/aks/terraform/variables.tf diff --git a/.github/code_spell_ignore.txt 
b/.github/code_spell_ignore.txt index e69de29bb..0283c7491 100644 --- a/.github/code_spell_ignore.txt +++ b/.github/code_spell_ignore.txt @@ -0,0 +1,2 @@ +aks +AKS diff --git a/README.md b/README.md index def720b5e..e2a0a5f52 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ The following steps are optional. They're only required if you want to run the w ### Use GenAI Microservices Connector (GMC) to deploy and adjust GenAIExamples -Follow [GMC README](https://github.com/opea-project/GenAIInfra/blob/main/microservices-connector/README.md) +Follow [GMC README](microservices-connector/README.md) to install GMC into your kubernetes cluster. [GenAIExamples](https://github.com/opea-project/GenAIExamples) contains several sample GenAI example use case pipelines such as ChatQnA, DocSum, etc. Once you have deployed GMC in your Kubernetes cluster, you can deploy any of the example pipelines by following its Readme file (e.g. [Docsum](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/kubernetes/intel/README_gmc.md)). @@ -55,13 +55,14 @@ Once you have deployed GMC in your Kubernetes cluster, you can deploy any of the To deploy GenAIExamples to Kubernetes using helm charts, you need [Helm](https://helm.sh/docs/intro/install/) installed on your machine. -For a detailed version, see [Deploy GenAIExample/GenAIComps using helm charts](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts/README.md) +For a detailed version, see [Deploy GenAIExample/GenAIComps using helm charts](helm-charts/README.md) ### Use terraform to deploy on cloud service providers You can use [Terraform](https://www.terraform.io/) to create infrastructure to run OPEA applications on various cloud service provider (CSP) environments. 
-- [AWS/EKS: Create managed Kubernetes cluster on AWS for OPEA](https://github.com/opea-project/GenAIInfra/blob/main/cloud-service-provider/aws/eks/terraform/README.MD)
+- [AWS/EKS: Create managed Kubernetes cluster on AWS for OPEA](cloud-service-provider/aws/eks/terraform/README.MD)
+- [Azure/AKS: Create managed Kubernetes cluster on Azure for OPEA](cloud-service-provider/azure/aks/terraform/README.md)

 ## Additional Content

diff --git a/cloud-service-provider/azure/aks/terraform/README.md b/cloud-service-provider/azure/aks/terraform/README.md
new file mode 100644
index 000000000..0eb1087c5
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/README.md
@@ -0,0 +1,84 @@
+# OPEA applications Azure AKS deployment guide
+
+This guide shows how to deploy OPEA applications on Azure Kubernetes Service (AKS) using Terraform.
+
+## Prerequisites
+
+- Access to Azure AKS
+- [Terraform](https://developer.hashicorp.com/terraform/tutorials/azure-get-started/install-cli), [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/) and [Helm](https://helm.sh/docs/helm/helm_install/) installed on your local machine.
+- Keep the Azure subscription handy and enter the subscription id when prompted during the terraform execution.
+
+## Setup
+
+The setup uses Terraform to create an AKS cluster with the following properties:
+
+- 1-node AKS cluster with 50 GB disk and `Standard_D32d_v5` SPOT (or standard based on the application variables) instance (32 vCPU and 128 GB memory)
+- Cluster autoscaling up to 10 nodes
+- Storage Class (SC) `azurefile-csi` and Persistent Volume Claim (PVC) `model-volume` for storing the model data
+
+Initialize the Terraform environment.
+
+```bash
+terraform init
+```
+
+## AKS cluster
+
+By default, a 1-node cluster is created, which is suitable for running the OPEA application. See `variables.tf` and `opea-<application>.tfvars` (e.g., `opea-chatqna.tfvars`) if you want to tune the cluster properties, e.g., number of nodes, instance types or disk size.
+
+## Persistent Volume Claim
+
+OPEA needs a volume in which to store the model. For that we need to create a Kubernetes Persistent Volume Claim (PVC). OPEA requires the `ReadWriteMany` option since multiple pods need access to the storage and they can be on different nodes. On AKS, only Azure File Service supports `ReadWriteMany`. Thus, each OPEA application below uses the file `aks-azfs-csi-pvc.yaml` to create a PVC in its namespace.
+
+## OPEA Applications
+
+### ChatQnA
+
+Use the commands below to create the AKS cluster.
+User has to input their Azure subscription id while running the following commands when prompted.
+
+```bash
+terraform plan --var-file opea-chatqna.tfvars -out opea-chatqna.plan
+terraform apply "opea-chatqna.plan"
+```
+
+Once the cluster is ready, the kubeconfig file to access the new cluster is updated automatically. By default, the file is `~/.kube/config`.
+
+Now you should have access to the cluster via the `kubectl` command.
+
+Deploy ChatQnA Application with Helm
+
+```bash
+helm install -n chatqna --create-namespace chatqna oci://ghcr.io/opea-project/charts/chatqna --set service.type=LoadBalancer --set global.modelUsePVC=model-volume --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
+```
+
+Create the PVC as mentioned [above](#persistent-volume-claim)
+
+```bash
+kubectl apply -f aks-azfs-csi-pvc.yaml -n chatqna
+```
+
+After a while, the OPEA application should be running. You can check the status via `kubectl`.
+
+```bash
+kubectl get pod -n chatqna
+```
+
+Ensure that all pods are running.
+You can now start using the OPEA application. Note that Azure load balancers expose an IP address (not a hostname), so the service address is read from `.ip`.
+
+```bash
+OPEA_SERVICE=$(kubectl get svc -n chatqna chatqna -ojsonpath='{.status.loadBalancer.ingress[0].ip}')
+curl http://${OPEA_SERVICE}:8888/v1/chatqna \
+    -H "Content-Type: application/json" \
+    -d '{"messages": "What is the revenue of Nike in 2023?"}'
+```
+
+Cleanup
+
+Delete the cluster via the following command.
User has to input their Azure subscription id while running the following commands when prompted.

```bash
helm uninstall -n chatqna chatqna
terraform destroy -var-file opea-chatqna.tfvars
```
diff --git a/cloud-service-provider/azure/aks/terraform/aks-azfs-csi-pvc.yaml b/cloud-service-provider/azure/aks/terraform/aks-azfs-csi-pvc.yaml
new file mode 100644
index 000000000..d98e0219a
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/aks-azfs-csi-pvc.yaml
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ReadWriteMany PVC backed by the AKS built-in azurefile-csi storage class,
# so that pods on different nodes can share the downloaded model data.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-volume
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: azurefile-csi
  resources:
    requests:
      storage: 100Gi
diff --git a/cloud-service-provider/azure/aks/terraform/azure_main.tf b/cloud-service-provider/azure/aks/terraform/azure_main.tf
new file mode 100644
index 000000000..b0b3746b3
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/azure_main.tf
@@ -0,0 +1,113 @@
# Kubernetes provider points at the local kubeconfig, which the
# null_resource az-cli provisioner below refreshes after cluster creation.
provider "kubernetes" {
  config_path = "~/.kube/config"
}

# Resource Group holding every resource created by this configuration.
resource "azurerm_resource_group" "main" {
  name     = "${var.cluster_name}-rg"
  location = var.location
}

# Virtual Network for the cluster.
# NOTE(review): the module's default address space/subnets are used here —
# confirm they do not overlap the service_cidr configured below.
module "vnet" {
  source              = "Azure/vnet/azurerm"
  resource_group_name = azurerm_resource_group.main.name
  vnet_name           = "${var.cluster_name}-vnet"
  vnet_location       = azurerm_resource_group.main.location

  tags = {
    environment = "dev"
  }
  depends_on = [azurerm_resource_group.main]
}

# AKS cluster with a single autoscaling default node pool.
resource "azurerm_kubernetes_cluster" "main" {
  name                = var.cluster_name
  location            = azurerm_resource_group.main.location
  resource_group_name = azurerm_resource_group.main.name
  dns_prefix          = var.cluster_name
  # NOTE(review): this reads var.cluster_version, while opea-chatqna.tfvars
  # sets the duplicate var.kubernetes_version — the two variables should be
  # consolidated so the tfvars value actually takes effect.
  kubernetes_version = var.cluster_version
  # NOTE(review): this flag is only meaningful for private clusters, and no
  # private cluster option is enabled here — confirm whether it is intended.
  private_cluster_public_fqdn_enabled = true

  default_node_pool {
    name                 = "default"
    auto_scaling_enabled = true
    node_count           = var.node_count
    vm_size              = var.instance_types[0]
min_count = var.min_count + max_count = var.max_count + vnet_subnet_id = module.vnet.vnet_subnets[0] + os_disk_size_gb = var.os_disk_size_gb + } + + identity { + type = "SystemAssigned" + } + + network_profile { + network_plugin = "azure" + load_balancer_sku = "standard" + service_cidr = "10.0.4.0/24" + dns_service_ip = "10.0.4.10" + } + +} + +# Azure Files Storage Account +resource "azurerm_storage_account" "main" { + name = replace(lower("${var.cluster_name}st"), "-", "") + resource_group_name = azurerm_resource_group.main.name + location = azurerm_resource_group.main.location + account_tier = "Premium" + account_replication_type = "LRS" + account_kind = "FileStorage" +} + +# Azure Files Share +resource "azurerm_storage_share" "main" { + name = "aksshare" + storage_account_id = azurerm_storage_account.main.id + quota = 100 +} + +# Key Vault +resource "azurerm_key_vault" "main" { + name = "${var.cluster_name}-kv" + location = azurerm_resource_group.main.location + resource_group_name = azurerm_resource_group.main.name + tenant_id = data.azurerm_client_config.current.tenant_id + sku_name = "standard" + soft_delete_retention_days = 7 + purge_protection_enabled = false + + access_policy { + tenant_id = data.azurerm_client_config.current.tenant_id + object_id = data.azurerm_client_config.current.object_id + + key_permissions = [ + "Create", + "Delete", + "Get", + "List", + ] + + secret_permissions = [ + "Set", + "Get", + "Delete", + "List", + ] + } +} + +# Update kubeconfig +resource "null_resource" "kubectl" { + provisioner "local-exec" { + command = "az aks get-credentials --resource-group ${azurerm_resource_group.main.name} --name ${azurerm_kubernetes_cluster.main.name} --overwrite-existing" + } + depends_on = [azurerm_kubernetes_cluster.main] +} + +# Data source for Azure subscription information +data "azurerm_client_config" "current" {} diff --git a/cloud-service-provider/azure/aks/terraform/opea-chatqna.tfvars 
b/cloud-service-provider/azure/aks/terraform/opea-chatqna.tfvars
new file mode 100644
index 000000000..8ab5e70bc
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/opea-chatqna.tfvars
@@ -0,0 +1,6 @@
# Sizing/pricing knobs for the ChatQnA example deployment.
cluster_name    = "opea"
instance_types  = ["Standard_D32d_v5"]
node_pool_type  = "Spot" # cheaper; NOTE(review): not yet consumed by azure_main.tf
os_disk_size_gb = 50
location        = "eastus"
kubernetes_version = "1.30"
# azure_main.tf reads cluster_version (not kubernetes_version), so set both
# until the duplicate variables are consolidated.
cluster_version = "1.30"
diff --git a/cloud-service-provider/azure/aks/terraform/outputs.tf b/cloud-service-provider/azure/aks/terraform/outputs.tf
new file mode 100644
index 000000000..0ce0d9085
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/outputs.tf
@@ -0,0 +1,21 @@
# Kube API endpoint; marked sensitive so it is redacted from plan/apply output.
output "cluster_endpoint" {
  description = "Endpoint for AKS control plane"
  sensitive   = true
  value       = azurerm_kubernetes_cluster.main.kube_config.0.host
}

output "oidc_issuer_url" {
  description = "The URL for the OpenID Connect issuer"
  value       = azurerm_kubernetes_cluster.main.oidc_issuer_url
}

output "location" {
  description = "Azure region"
  value       = var.location
}

output "cluster_name" {
  description = "Kubernetes Cluster Name"
  value       = azurerm_kubernetes_cluster.main.name
}
diff --git a/cloud-service-provider/azure/aks/terraform/terraform.tf b/cloud-service-provider/azure/aks/terraform/terraform.tf
new file mode 100644
index 000000000..6148a8147
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/terraform.tf
@@ -0,0 +1,18 @@
# Provider version pins: azurerm 4.x, kubernetes pinned exactly.
terraform {
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~> 4.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "2.33.0"
    }
  }
}

# Azure provider configuration; subscription id is prompted for at run time.
provider "azurerm" {
  features {}
  subscription_id = var.subscription_id
}
diff --git a/cloud-service-provider/azure/aks/terraform/variables.tf b/cloud-service-provider/azure/aks/terraform/variables.tf
new file mode 100644
index 000000000..ef3f8a45a
--- /dev/null
+++ b/cloud-service-provider/azure/aks/terraform/variables.tf
@@ -0,0
+1,83 @@
# Azure region to deploy all resources into.
variable "location" {
  description = "Azure region"
  type        = string
  default     = "eastus"
}

# Base name for the cluster and derived resources: resource group
# "<name>-rg", key vault "<name>-kv", storage account "<name>st", and the
# cluster dns_prefix.  Keep it short, lowercase, alphanumeric/hyphen only —
# the previous default ("opea aks cluster") contained spaces, which are not
# valid in key vault, storage account, or dns_prefix names, so the defaults
# could never deploy successfully.
variable "cluster_name" {
  description = "AKS cluster name"
  type        = string
  default     = "opea"
}

# NOTE(review): duplicates cluster_version below; azure_main.tf reads
# cluster_version, while opea-chatqna.tfvars sets this one.  Kept for
# backward compatibility with existing tfvars files.
variable "kubernetes_version" {
  description = "AKS cluster version"
  type        = string
  default     = "1.30"
}

# NOTE(review): not referenced anywhere in azure_main.tf — presumably meant
# to toggle a custom node pool; verify before relying on it.
variable "use_custom_node_config" {
  description = "Enable custom node configuration"
  type        = bool
  default     = true
}

variable "subscription_id" {
  description = "This is the Azure subscription id of the user"
  type        = string
}

variable "os_disk_size_gb" {
  description = "OS disk size in GB for nodes"
  type        = number
  default     = 50
}

# NOTE(review): not referenced by azure_main.tf, so the default node pool is
# always created on-demand; Spot pools are typically added as a separate
# node pool resource — confirm before advertising Spot support.
variable "node_pool_type" {
  description = "VM spot or on-demand instance types"
  type        = string
  default     = "Regular" # Regular for on-demand, Spot for spot instances
}

variable "min_count" {
  description = "Minimum number of nodes"
  type        = number
  default     = 1
}

variable "max_count" {
  description = "Maximum number of nodes"
  type        = number
  default     = 10
}

variable "node_count" {
  description = "Desired number of nodes"
  type        = number
  default     = 1
}

variable "resource_group_name" {
  description = "Name of the resource group"
  type        = string
  default     = null
}

variable "vnet_subnet_id" {
  description = "ID of the subnet where the cluster will be deployed"
  type        = string
  default     = null
}

# NOTE(review): this is the variable azure_main.tf actually uses for the
# cluster's kubernetes_version; see the duplicate "kubernetes_version" above.
variable "cluster_version" {
  description = "Kubernetes version for the cluster"
  type        = string
  default     = "1.30"
}

variable "instance_types" {
  description = "Azure VM instance type"
  type        = list(string)
  default     = ["Standard_D32d_v5"]
}