From 48781f522fb78f91e769bea5f6f8d09ce7f67136 Mon Sep 17 00:00:00 2001 From: Parampreet Singh <50599809+Paramadon@users.noreply.github.com> Date: Wed, 3 Jul 2024 09:27:23 -0400 Subject: [PATCH] EKS Addon Fargate Bug Fix (#58) --- .../linux/cloudwatch-agent-daemonset.yaml | 9 ++++ .../linux/dcgm-exporter-daemonset.yaml | 12 +++-- .../templates/linux/fluent-bit-daemonset.yaml | 9 ++++ .../linux/neuron-monitor-daemonset.yaml | 20 +++++---- .../values.yaml | 44 ++++++++++--------- 5 files changed, 61 insertions(+), 33 deletions(-) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml index 84b26b7..c5298d4 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-daemonset.yaml @@ -30,6 +30,15 @@ spec: nodeSelector: kubernetes.io/os: linux serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate {{- if .Values.agent.config }} config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }} {{- else }} diff --git a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml index 82ce49c..f812bb0 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml @@ -15,10 +15,14 @@ spec: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: {{ .Values.nodeLabelKey }} - operator: In - values: {{ .Values.gpuInstances | toYaml | nindent 16 }} + - matchExpressions: + - key: {{ .Values.nodeLabelKey }} + operator: In + values: {{ .Values.gpuInstances | toYaml | nindent 16 }} + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate resources: requests: cpu: 250m diff --git a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml index ed8a8d9..79d3698 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml @@ -95,6 +95,15 @@ spec: hostPath: path: /var/log/dmesg serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate nodeSelector: kubernetes.io/os: linux {{- with .Values.tolerations }} diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 262051b..686a608 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -14,13 +14,17 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: kubernetes.io/os - operator: In - values: - - linux - - key: {{ .Values.nodeLabelKey }} - operator: In - values: {{ .Values.neuronInstances | toYaml | nindent 20 }} + - key: kubernetes.io/os + operator: In + values: + - linux + - key: {{ .Values.nodeLabelKey }} + operator: In + values: {{ .Values.neuronInstances | toYaml | nindent 20 }} + - key: {{ .Values.fargateLabelKey }} + operator: NotIn + values: + - fargate resources: limits: cpu: 500m @@ -41,7 +45,7 @@ spec: - name: "metrics" port: {{ .Values.neuronMonitor.service.port }} command: - - "/opt/bin/entrypoint.sh" + - "/opt/bin/entrypoint.sh" args: port: "{{ .Values.neuronMonitor.service.port }}" cert-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt" diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml index c8ed497..284dd19 100644 --- a/charts/amazon-cloudwatch-observability/values.yaml +++ b/charts/amazon-cloudwatch-observability/values.yaml @@ -18,6 +18,8 @@ clusterName: region: nodeLabelKey: node.kubernetes.io/instance-type +fargateLabelKey: eks.amazonaws.com/compute-type + ## NVIDIA GPU instance types gpuInstances: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.16xlarge, p3dn.24xlarge, p4d.24xlarge, p4de.24xlarge, p5.48xlarge, g3s.xlarge, g3.4xlarge, g3.8xlarge, g3.16xlarge, g4dn.xlarge, g4dn.2xlarge, g4dn.4xlarge, g4dn.8xlarge, g4dn.16xlarge, g4dn.12xlarge, g4dn.metal, g4ad.xlarge, g4ad.2xlarge, g4ad.4xlarge, g4ad.8xlarge, g4ad.16xlarge, g5.xlarge, g5.2xlarge, g5.4xlarge, g5.8xlarge, g5.16xlarge, g5.12xlarge, g5.24xlarge, g5.48xlarge, g5g.xlarge, g5g.2xlarge, g5g.4xlarge, g5g.8xlarge, g5g.16xlarge, g5g.metal, ml.p5.48xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.g3s.xlarge, ml.g3.4xlarge, ml.g3.8xlarge, ml.g3.16xlarge, ml.g4dn.xlarge, ml.g4dn.2xlarge, ml.g4dn.4xlarge, ml.g4dn.8xlarge, ml.g4dn.16xlarge, ml.g4dn.12xlarge, ml.g4dn.metal, ml.g4ad.xlarge, ml.g4ad.2xlarge, ml.g4ad.4xlarge, ml.g4ad.8xlarge, ml.g4ad.16xlarge, ml.g5.xlarge, ml.g5.2xlarge, ml.g5.4xlarge, ml.g5.8xlarge, ml.g5.16xlarge, ml.g5.12xlarge, ml.g5.24xlarge, ml.g5.48xlarge, ml.g5g.xlarge, ml.g5g.2xlarge, ml.g5g.4xlarge, ml.g5g.8xlarge, ml.g5g.16xlarge, ml.g5g.metal] @@ -60,14 +62,14 @@ containerLogs: Regex ^(?