diff --git a/charts/dcgm-exporter/.helmignore b/charts/dcgm-exporter/.helmignore new file mode 100644 index 000000000000..0e8a0eb36f4c --- /dev/null +++ b/charts/dcgm-exporter/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/dcgm-exporter/Chart.yaml b/charts/dcgm-exporter/Chart.yaml new file mode 100644 index 000000000000..21caf0ff900a --- /dev/null +++ b/charts/dcgm-exporter/Chart.yaml @@ -0,0 +1,20 @@ +apiVersion: v2 +appVersion: 3.4.0 +description: A Helm chart for DCGM exporter +home: https://github.com/nvidia/dcgm-exporter/ +icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png +keywords: +- gpu +- cuda +- compute +- monitoring +- telemetry +- tesla +kubeVersion: '>= 1.19.0-0' +name: dcgm-exporter +maintainers: + - name: glowkey + - name: nvvfedorov +sources: +- https://github.com/nvidia/dcgm-exporter +version: 3.4.0 diff --git a/charts/dcgm-exporter/templates/NOTES.txt b/charts/dcgm-exporter/templates/NOTES.txt new file mode 100644 index 000000000000..3cec096648d8 --- /dev/null +++ b/charts/dcgm-exporter/templates/NOTES.txt @@ -0,0 +1,15 @@ +1. Get the application URL by running these commands: +{{- if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dcgm-exporter.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ include "dcgm-exporter.namespace" . 
}} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT/metrics +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch its status by running 'kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} svc -w {{ include "dcgm-exporter.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ include "dcgm-exporter.namespace" . }} {{ include "dcgm-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods -n {{ include "dcgm-exporter.namespace" . }} -l "app.kubernetes.io/name={{ include "dcgm-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + kubectl -n {{ include "dcgm-exporter.namespace" . }} port-forward $POD_NAME 8080:{{ .Values.service.port }} & + echo "Visit http://127.0.0.1:8080/metrics to use your application" +{{- end }} diff --git a/charts/dcgm-exporter/templates/_helpers.tpl b/charts/dcgm-exporter/templates/_helpers.tpl new file mode 100644 index 000000000000..ff71dd0494e3 --- /dev/null +++ b/charts/dcgm-exporter/templates/_helpers.tpl @@ -0,0 +1,75 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "dcgm-exporter.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name.
+*/}} +{{- define "dcgm-exporter.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + + +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "dcgm-exporter.namespace" -}} + {{- if .Values.namespaceOverride -}} + {{- .Values.namespaceOverride -}} + {{- else -}} + {{- .Release.Namespace -}} + {{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "dcgm-exporter.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "dcgm-exporter.labels" -}} +helm.sh/chart: {{ include "dcgm-exporter.chart" . }} +{{ include "dcgm-exporter.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "dcgm-exporter.selectorLabels" -}} +app.kubernetes.io/name: {{ include "dcgm-exporter.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account to use +*/}} +{{- define "dcgm-exporter.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} + {{ default (include "dcgm-exporter.fullname" .) 
.Values.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.serviceAccount.name }} +{{- end -}} +{{- end -}} diff --git a/charts/dcgm-exporter/templates/daemonset.yaml b/charts/dcgm-exporter/templates/daemonset.yaml new file mode 100644 index 000000000000..d55e000f86ee --- /dev/null +++ b/charts/dcgm-exporter/templates/daemonset.yaml @@ -0,0 +1,135 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "dcgm-exporter.fullname" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" +spec: + updateStrategy: + type: RollingUpdate + {{- with .Values.rollingUpdate }} + rollingUpdate: + maxUnavailable: {{ .maxUnavailable }} + maxSurge: {{ .maxSurge }} + {{- end }} + selector: + matchLabels: + {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: "dcgm-exporter" + template: + metadata: + labels: + {{- include "dcgm-exporter.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: "dcgm-exporter" + {{- if .Values.podLabels }} + {{- toYaml .Values.podLabels | nindent 8 }} + {{- end }} + {{- if .Values.podAnnotations }} + annotations: + {{- toYaml .Values.podAnnotations | nindent 8 }} + {{- end }} + spec: + {{- if .Values.runtimeClassName }} + runtimeClassName: {{ .Values.runtimeClassName }} + {{- end }} + priorityClassName: {{ .Values.priorityClassName | default "system-node-critical" }} + {{- if .Values.hostNetwork }} + hostNetwork: {{ .Values.hostNetwork }} + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "dcgm-exporter.serviceAccountName" . }} + {{- if .Values.podSecurityContext }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- end }} + {{- if .Values.affinity }} + affinity: + {{- toYaml .Values.affinity | nindent 8 }} + {{- end }} + {{- if .Values.nodeSelector }} + nodeSelector: + {{- toYaml .Values.nodeSelector | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + volumes: + - name: "pod-gpu-resources" + hostPath: + path: {{ .Values.kubeletPath }} + {{- range .Values.extraHostVolumes }} + - name: {{ .name | quote }} + hostPath: + path: {{ .hostPath | quote }} + {{- end }} + {{- with .Values.extraConfigMapVolumes }} + {{- toYaml . | nindent 6 }} + {{- end }} + containers: + - name: exporter + securityContext: + {{- toYaml .Values.securityContext | nindent 10 }} + image: "{{ .Values.global.imageRegistry | default .Values.image.registry }}/{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: "{{ .Values.image.pullPolicy }}" + args: + {{- range $.Values.arguments }} + - {{ . 
}} + {{- end }} + env: + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: "{{ .Values.service.address }}" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + {{- if .Values.extraEnv }} + {{- toYaml .Values.extraEnv | nindent 8 }} + {{- end }} + ports: + - name: "metrics" + containerPort: {{ .Values.service.port }} + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + {{- if .Values.extraVolumeMounts }} + {{- toYaml .Values.extraVolumeMounts | nindent 8 }} + {{- end }} + livenessProbe: + httpGet: + path: /health + port: {{ .Values.service.port }} + initialDelaySeconds: 45 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: {{ .Values.service.port }} + initialDelaySeconds: 45 + {{- if .Values.resources }} + resources: + {{- toYaml .Values.resources | nindent 10 }} + {{- end }} diff --git a/charts/dcgm-exporter/templates/extra-objects.yaml b/charts/dcgm-exporter/templates/extra-objects.yaml new file mode 100644 index 000000000000..928593c35b61 --- /dev/null +++ b/charts/dcgm-exporter/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{ range .Values.extraManifests }} +--- +{{ tpl (toYaml .) $ }} +{{ end }} \ No newline at end of file diff --git a/charts/dcgm-exporter/templates/metrics-configmap.yaml b/charts/dcgm-exporter/templates/metrics-configmap.yaml new file mode 100644 index 000000000000..faf8cfd84048 --- /dev/null +++ b/charts/dcgm-exporter/templates/metrics-configmap.yaml @@ -0,0 +1,85 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: exporter-metrics-config-map + namespace: {{ include "dcgm-exporter.namespace" . }} +data: + metrics: | + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + + # Clocks + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
+ + # Temperature + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). + DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + + # Power + DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + + # PCIE + # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. + # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + + # Utilization (the sample period varies depending on the product) + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). + DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + + # Errors and violations + DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). + # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). + # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). + # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). + # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). + # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + + # Memory usage + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + + # ECC + # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 
+ # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. + # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + + # Retired pages + # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. + # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. + # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + + # NVLink + # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. + # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. + # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. + # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + + # VGPU License status + DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + + # Remapped rows + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors + DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + + # DCP metrics + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+ DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). + DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. + DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. diff --git a/charts/dcgm-exporter/templates/role.yaml b/charts/dcgm-exporter/templates/role.yaml new file mode 100644 index 000000000000..4b1e0c985c29 --- /dev/null +++ b/charts/dcgm-exporter/templates/role.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: dcgm-exporter-read-cm + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" +rules: +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["exporter-metrics-config-map"] + verbs: ["get"] diff --git a/charts/dcgm-exporter/templates/rolebinding.yaml b/charts/dcgm-exporter/templates/rolebinding.yaml new file mode 100644 index 000000000000..cb76feecbb47 --- /dev/null +++ b/charts/dcgm-exporter/templates/rolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "dcgm-exporter.fullname" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" +subjects: +- kind: ServiceAccount + name: {{ include "dcgm-exporter.serviceAccountName" . }} + namespace: {{ include "dcgm-exporter.namespace" . 
}} +roleRef: + kind: Role + name: dcgm-exporter-read-cm + apiGroup: rbac.authorization.k8s.io diff --git a/charts/dcgm-exporter/templates/service-monitor.yaml b/charts/dcgm-exporter/templates/service-monitor.yaml new file mode 100644 index 000000000000..6a2628bf679a --- /dev/null +++ b/charts/dcgm-exporter/templates/service-monitor.yaml @@ -0,0 +1,42 @@ +{{- if and (.Values.serviceMonitor.enabled) (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }} +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "dcgm-exporter.fullname" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" + {{- if .Values.serviceMonitor.additionalLabels }} + {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: "dcgm-exporter" + namespaceSelector: + matchNames: + - "{{ include "dcgm-exporter.namespace" . 
}}" + endpoints: + - port: "metrics" + path: "/metrics" + interval: "{{ .Values.serviceMonitor.interval }}" + honorLabels: {{ .Values.serviceMonitor.honorLabels }} + relabelings: + {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }} +{{- end -}} diff --git a/charts/dcgm-exporter/templates/service.yaml b/charts/dcgm-exporter/templates/service.yaml new file mode 100644 index 000000000000..0ea2e34a4170 --- /dev/null +++ b/charts/dcgm-exporter/templates/service.yaml @@ -0,0 +1,37 @@ +{{- if .Values.service.enable }} +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "dcgm-exporter.fullname" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type }} + ports: + - name: "metrics" + port: {{ .Values.service.port }} + targetPort: {{ .Values.service.port }} + protocol: TCP + selector: + {{- include "dcgm-exporter.selectorLabels" . 
| nindent 4 }} +{{- end }} diff --git a/charts/dcgm-exporter/templates/serviceaccount.yaml b/charts/dcgm-exporter/templates/serviceaccount.yaml new file mode 100644 index 000000000000..54c85697f793 --- /dev/null +++ b/charts/dcgm-exporter/templates/serviceaccount.yaml @@ -0,0 +1,28 @@ +{{- if .Values.serviceAccount.create -}} +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "dcgm-exporter.serviceAccountName" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + {{- include "dcgm-exporter.labels" . | nindent 4 }} + app.kubernetes.io/component: "dcgm-exporter" + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end -}} diff --git a/charts/dcgm-exporter/values.yaml b/charts/dcgm-exporter/values.yaml new file mode 100644 index 000000000000..3968e0fa8e8b --- /dev/null +++ b/charts/dcgm-exporter/values.yaml @@ -0,0 +1,264 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +global: + imageRegistry: "" + imagePullSecrets: [] + +image: + registry: docker.io + repository: kubesphere/dcgm-exporter + pullPolicy: IfNotPresent + # Image tag defaults to AppVersion, but you can use the tag key + # for the image tag, e.g: + tag: 3.3.5-3.4.0-ubuntu22.04 + +# Change the following reference to "/etc/dcgm-exporter/default-counters.csv" +# to stop profiling metrics from DCGM +arguments: + - "-f" + - "/etc/dcgm-exporter/custom-dcgm-metrics.csv" +# default arguments: +# ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] +# NOTE: in general, add any command line arguments to arguments above +# and they will be passed through. +# Use "-r", "HOST:PORT" to connect to an already running hostengine +# Example arguments: ["-r", "host123:5555"] +# Use "-n" to remove the hostname tag from the output. +# Example arguments: ["-n"] +# Use "-d" to specify the devices to monitor. -d must be followed by a string +# in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]] +# Where a numeric range is something like 0-4 or 0,2,4, etc. +# Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or +# ["-d", "g:0-3"] to monitor GPUs 0-3. +# Use "-m" to specify the namespace and name of a configmap containing +# the watched exporter fields.
+# Example arguments: ["-m", "default:exporter-metrics-config-map"] + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" +namespaceOverride: "" + +runtimeClassName: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + +rollingUpdate: + # Specifies maximum number of DaemonSet pods that can be unavailable during the update + maxUnavailable: 1 + # Specifies maximum number of nodes with an existing available DaemonSet pod that can have an updated DaemonSet pod during an update + maxSurge: 0 + +podLabels: {} + +podAnnotations: {} +# Using this annotation which is required for prometheus scraping + # prometheus.io/scrape: "true" + # prometheus.io/port: "9400" + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: ["SYS_ADMIN"] + # readOnlyRootFilesystem: true + +service: + enable: true + type: ClusterIP + port: 9400 + address: ":9400" + # Annotations to add to the service + annotations: {} + +resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + +serviceMonitor: + enabled: true + interval: 15s + honorLabels: true + additionalLabels: {} + # monitoring: prometheus + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + +nodeSelector: {} + # node: gpu + +tolerations: [] + # - operator: Exists + +affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: nvidia-gpu + # operator: Exists + +extraHostVolumes: [] + # - name: host-binaries + # hostPath: /opt/bin + +extraConfigMapVolumes: + - name: custom-dcgm-metrics +
configMap: + name: custom-dcgm-metrics-configmap + +extraVolumeMounts: + - name: custom-dcgm-metrics + mountPath: /etc/dcgm-exporter/custom-dcgm-metrics.csv + subPath: custom-dcgm-metrics.csv + readOnly: true + +extraEnv: [] + # - name: EXTRA_VAR + # value: "TheStringValue" + +kubeletPath: "/var/lib/kubelet/pod-resources" + + +extraManifests: + - apiVersion: v1 + kind: ConfigMap + metadata: + name: custom-dcgm-metrics-configmap + namespace: '{{ include "dcgm-exporter.namespace" . }}' + data: + custom-dcgm-metrics.csv: | + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + + # Clocks + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + + # Temperature + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). + DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + + # Power + DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + + # PCIE + # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. + # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + + # Utilization (the sample period varies depending on the product) + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). + DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + + # Errors and violations + DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). + DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). 
+ DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). + DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). + DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + + # Memory usage + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + + # ECC + # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. + # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + + # Retired pages + DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. + DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. + DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + + # NVLink + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. 
+ + # VGPU License status + DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + + # Remapped rows + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors + DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + + # Static configuration information. These appear as labels on the other metrics + DCGM_FI_DRIVER_VERSION, label, Driver Version + DCGM_FI_NVML_VERSION, label, NVML Version + DCGM_FI_DEV_BRAND, label, Device Brand + DCGM_FI_DEV_SERIAL, label, Device Serial Number + DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version + DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version + DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version + DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version + DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + + # DCP metrics + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). + DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). + DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). + DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). + DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). + DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). + DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+ DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. + + + # Additional recommended fields + DCGM_FI_DEV_COUNT ,counter ,Number of Devices on the node. + DCGM_FI_DEV_POWER_MGMT_LIMIT ,gauge ,Current power limit for the device. + DCGM_FI_DEV_PSTATE ,gauge ,Performance state (P-State) 0-15. 0=highest + DCGM_FI_DEV_FB_TOTAL ,gauge , + DCGM_FI_DEV_FB_RESERVED ,gauge , + DCGM_FI_DEV_FB_USED_PERCENT ,gauge , + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS ,gauge ,Current clock throttle reasons + + DCGM_FI_PROCESS_NAME ,label ,The Process Name. + DCGM_FI_CUDA_DRIVER_VERSION ,label ,CUDA driver version + DCGM_FI_DEV_NAME ,label ,The Device Name