aws-observability · bonclay7 · Jan 24, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/docs/eks/gpu-monitoring.md b/docs/eks/gpu-monitoring.md
@@ -0,0 +1,41 @@
+# Monitoring NVIDIA GPU Workloads
+
+GPUs play an integral part in data-intensive workloads. The base infrastructure module of the Observability Accelerator provides the ability to deploy the NVIDIA DCGM Exporter Dashboard.
+The dashboard uses metrics scraped from the `/metrics` endpoint, which is exposed when running the NVIDIA GPU Operator and the `nvidia-smi` (NVSMI) binary.
+
+!!!note
+    To make use of this dashboard, you need a GPU-backed EKS cluster with the [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html) deployed.
+    The recommended way of deploying the GPU operator is the [Data on EKS Blueprint](https://github.com/aws-ia/terraform-aws-eks-data-addons/blob/main/nvidia-gpu-operator.tf).
+
+## Deployment
+
+GPU monitoring is enabled by default in the [eks-monitoring module](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/). Set `enable_gpu_monitoring = false` to turn it off.
+
+## Dashboards
+
+To start producing diagnostic metrics, you must first run the `nvidia-smi` binary. `nvidia-smi` (also known as NVSMI) provides monitoring and management capabilities for NVIDIA devices from the Fermi architecture family onward. The following command deploys a pod that runs `nvidia-smi`, which shows diagnostic information about all GPUs visible to the container:
+
+```
+cat << EOF | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nvidia-smi
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: nvidia-smi
+    image: "nvidia/cuda:11.0.3-base-ubuntu20.04"
+    args:
+    - "nvidia-smi"
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+EOF
+```
+After the metrics are produced, they should populate the DCGM exporter dashboard:
+
+![NVIDIA DCGM Exporter dashboard](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/97046295/66e8ae83-3a78-48b8-a9fc-4460a5a4d173)
+
+
+
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -28,6 +28,7 @@ nav:
   - Amazon EKS:
       - Infrastructure: eks/index.md
       - EKS API server: eks/eks-apiserver.md
+      - EKS GPU monitoring: eks/gpu-monitoring.md
       - Multicluster:
           - Single AWS account: eks/multicluster.md
           - Cross AWS account: eks/multiaccount.md

diff --git a/modules/eks-monitoring/dashboards.tf b/modules/eks-monitoring/dashboards.tf
@@ -95,6 +95,26 @@ YAML
   depends_on = [module.external_secrets]
 }
 
+# Flux Kustomization for the GPU dashboards; created only when var.enable_gpu_monitoring is true
+resource "kubectl_manifest" "gpu_monitoring_dashboards" {
+  count      = var.enable_gpu_monitoring ? 1 : 0
+  depends_on = [module.external_secrets]
+  yaml_body  = <<YAML
+apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
+kind: Kustomization
+metadata:
+  name: ${local.gpu_monitoring_config.flux_kustomization_name}
+  namespace: flux-system
+spec:
+  interval: 1m0s
+  path: ${local.gpu_monitoring_config.flux_kustomization_path}
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: ${local.gpu_monitoring_config.flux_gitrepository_name}
+YAML
+}
+
 resource "kubectl_manifest" "kubeproxy_monitoring_dashboard" {
   yaml_body  = <<YAML
 apiVersion: kustomize.toolkit.fluxcd.io/v1beta2

diff --git a/modules/eks-monitoring/locals.tf b/modules/eks-monitoring/locals.tf
@@ -127,6 +127,15 @@ locals {
     }
   }
 
+  gpu_monitoring_config = {
+    # can be overridden by providing a config; the kustomization name should be unique within the flux-system namespace
+    flux_gitrepository_name   = try(var.gpu_monitoring_config.flux_gitrepository_name, var.flux_gitrepository_name)
+    flux_gitrepository_url    = try(var.gpu_monitoring_config.flux_gitrepository_url, var.flux_gitrepository_url)
+    flux_gitrepository_branch = try(var.gpu_monitoring_config.flux_gitrepository_branch, var.flux_gitrepository_branch)
+    flux_kustomization_name   = try(var.gpu_monitoring_config.flux_kustomization_name, "grafana-dashboards-gpu")
+    flux_kustomization_path   = try(var.gpu_monitoring_config.flux_kustomization_path, "./artifacts/grafana-operator-manifests/eks/gpu")
+  }
+
   kubeproxy_monitoring_config = {
     # can be overriden by providing a config
     flux_gitrepository_name   = try(var.kubeproxy_monitoring_config.flux_gitrepository_name, var.flux_gitrepository_name)

diff --git a/modules/eks-monitoring/main.tf b/modules/eks-monitoring/main.tf
@@ -189,6 +189,10 @@ module "helm_addon" {
       name  = "enableAdotcollectorMetrics"
       value = var.enable_adotcollector_metrics
     },
+    {
+      name  = "enableGpuMonitoring"
+      value = var.enable_gpu_monitoring
+    },
     {
       name  = "serviceAccount"
       value = local.kube_service_account_name

diff --git a/modules/eks-monitoring/variables.tf b/modules/eks-monitoring/variables.tf
@@ -540,6 +540,26 @@ variable "enable_adotcollector_metrics" {
   default     = true
 }
 
+variable "enable_gpu_monitoring" {
+  type        = bool
+  default     = true
+  description = "Enables monitoring of GPU metrics"
+}
+
+variable "gpu_monitoring_config" {
+  # Null by default: locals.tf resolves each field with try(); supply the complete object to override.
+  default     = null
+  description = "Config object for GPU monitoring"
+
+  type = object({
+    flux_kustomization_name   = string
+    flux_kustomization_path   = string
+    flux_gitrepository_name   = string
+    flux_gitrepository_url    = string
+    flux_gitrepository_branch = string
+  })
+}
+
 variable "adothealth_monitoring_config" {
   description = "Config object for ADOT health monitoring"
   type = object({