Skip to content

Commit

Permalink
EKS Addon Fargate Bug Fix (#58)
Browse files Browse the repository at this point in the history
  • Loading branch information
Paramadon authored Jul 3, 2024
1 parent e0e99c7 commit 48781f5
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ spec:
nodeSelector:
kubernetes.io/os: linux
serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: {{ .Values.fargateLabelKey }}
operator: NotIn
values:
- fargate
{{- if .Values.agent.config }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }}
{{- else }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@ spec:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: {{ .Values.nodeLabelKey }}
operator: In
values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
- matchExpressions:
- key: {{ .Values.nodeLabelKey }}
operator: In
values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
- key: {{ .Values.fargateLabelKey }}
operator: NotIn
values:
- fargate
resources:
requests:
cpu: 250m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ spec:
hostPath:
path: /var/log/dmesg
serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: {{ .Values.fargateLabelKey }}
operator: NotIn
values:
- fargate
nodeSelector:
kubernetes.io/os: linux
{{- with .Values.tolerations }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@ spec:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
- key: {{ .Values.nodeLabelKey }}
operator: In
values: {{ .Values.neuronInstances | toYaml | nindent 20 }}
- key: kubernetes.io/os
operator: In
values:
- linux
- key: {{ .Values.nodeLabelKey }}
operator: In
values: {{ .Values.neuronInstances | toYaml | nindent 20 }}
- key: {{ .Values.fargateLabelKey }}
operator: NotIn
values:
- fargate
resources:
limits:
cpu: 500m
Expand All @@ -41,7 +45,7 @@ spec:
- name: "metrics"
port: {{ .Values.neuronMonitor.service.port }}
command:
- "/opt/bin/entrypoint.sh"
- "/opt/bin/entrypoint.sh"
args:
port: "{{ .Values.neuronMonitor.service.port }}"
cert-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt"
Expand Down
44 changes: 23 additions & 21 deletions charts/amazon-cloudwatch-observability/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ clusterName:
region:

nodeLabelKey: node.kubernetes.io/instance-type
fargateLabelKey: eks.amazonaws.com/compute-type

## NVIDIA GPU instance types
gpuInstances: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.16xlarge, p3dn.24xlarge, p4d.24xlarge, p4de.24xlarge, p5.48xlarge, g3s.xlarge, g3.4xlarge, g3.8xlarge, g3.16xlarge, g4dn.xlarge, g4dn.2xlarge, g4dn.4xlarge, g4dn.8xlarge, g4dn.16xlarge, g4dn.12xlarge, g4dn.metal, g4ad.xlarge, g4ad.2xlarge, g4ad.4xlarge, g4ad.8xlarge, g4ad.16xlarge, g5.xlarge, g5.2xlarge, g5.4xlarge, g5.8xlarge, g5.16xlarge, g5.12xlarge, g5.24xlarge, g5.48xlarge, g5g.xlarge, g5g.2xlarge, g5g.4xlarge, g5g.8xlarge, g5g.16xlarge, g5g.metal, ml.p5.48xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.g3s.xlarge, ml.g3.4xlarge, ml.g3.8xlarge, ml.g3.16xlarge, ml.g4dn.xlarge, ml.g4dn.2xlarge, ml.g4dn.4xlarge, ml.g4dn.8xlarge, ml.g4dn.16xlarge, ml.g4dn.12xlarge, ml.g4dn.metal, ml.g4ad.xlarge, ml.g4ad.2xlarge, ml.g4ad.4xlarge, ml.g4ad.8xlarge, ml.g4ad.16xlarge, ml.g5.xlarge, ml.g5.2xlarge, ml.g5.4xlarge, ml.g5.8xlarge, ml.g5.16xlarge, ml.g5.12xlarge, ml.g5.24xlarge, ml.g5.48xlarge, ml.g5g.xlarge, ml.g5g.2xlarge, ml.g5g.4xlarge, ml.g5g.8xlarge, ml.g5g.16xlarge, ml.g5g.metal]

Expand Down Expand Up @@ -60,14 +62,14 @@ containerLogs:
Regex ^(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
Time_Key time
Time_Format %b %d %H:%M:%S
[PARSER]
Name container_firstline
Format regex
Regex (?<log>(?<="log":")\S(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
Time_Key time
Time_Format %Y-%m-%dT%H:%M:%S.%LZ

[PARSER]
Name cwagent_firstline
Format regex
Expand All @@ -89,7 +91,7 @@ containerLogs:
Rotate_Wait 30
storage.type filesystem
Read_from_Head ${READ_FROM_HEAD}
[INPUT]
Name tail
Tag application.*
Expand All @@ -100,7 +102,7 @@ containerLogs:
Skip_Long_Lines On
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}

[INPUT]
Name tail
Tag application.*
Expand All @@ -111,7 +113,7 @@ containerLogs:
Skip_Long_Lines On
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}

[FILTER]
Name kubernetes
Match application.*
Expand All @@ -126,7 +128,7 @@ containerLogs:
Use_Kubelet On
Kubelet_Port 10250
Buffer_Size 0

[OUTPUT]
Name cloudwatch_logs
Match application.*
Expand All @@ -145,7 +147,7 @@ containerLogs:
DB /var/fluent-bit/state/systemd.db
Path /var/log/journal
Read_From_Tail ${READ_FROM_TAIL}
[INPUT]
Name tail
Tag dataplane.tail.*
Expand All @@ -158,20 +160,20 @@ containerLogs:
Rotate_Wait 30
storage.type filesystem
Read_from_Head ${READ_FROM_HEAD}

[FILTER]
Name modify
Match dataplane.systemd.*
Rename _HOSTNAME hostname
Rename _SYSTEMD_UNIT systemd_unit
Rename MESSAGE message
Remove_regex ^((?!hostname|systemd_unit|message).)*$

[FILTER]
Name aws
Match dataplane.*
imds_version v2

[OUTPUT]
Name cloudwatch_logs
Match dataplane.*
Expand Down Expand Up @@ -235,7 +237,7 @@ containerLogs:
Daemon off
net.dns.resolver LEGACY
Parsers_File parsers.conf
@INCLUDE application-log.conf
@INCLUDE dataplane-log.conf
@INCLUDE host-log.conf
Expand Down Expand Up @@ -273,7 +275,7 @@ containerLogs:
Rotate_Wait 30
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}
[INPUT]
Name tail
Tag application.*
Expand All @@ -285,7 +287,7 @@ containerLogs:
Rotate_Wait 30
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}

[INPUT]
Name tail
Tag application.*
Expand All @@ -297,7 +299,7 @@ containerLogs:
Rotate_Wait 30
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}

[OUTPUT]
Name cloudwatch_logs
Match application.*
Expand All @@ -318,7 +320,7 @@ containerLogs:
Rotate_Wait 30
Refresh_Interval 10
Read_from_Head ${READ_FROM_HEAD}
[INPUT]
Name tail
Tag dataplane.tail.C.ProgramData.Amazon.EKS.logs.vpc-bridge
Expand All @@ -337,7 +339,7 @@ containerLogs:
Channels EKS
DB C:\\var\\fluent-bit\\state\\flb_eks_winlog.db
Interval_Sec 60

[FILTER]
Name aws
Match dataplane.*
Expand All @@ -347,7 +349,7 @@ containerLogs:
Name aws
Match winlog.*
imds_version v2

[OUTPUT]
Name cloudwatch_logs
Match dataplane.*
Expand All @@ -371,12 +373,12 @@ containerLogs:
Channels System
DB C:\\var\\fluent-bit\\state\\flb_system_winlog.db
Interval_Sec 60
[FILTER]
Name aws
Match winlog.*
imds_version v2

[OUTPUT]
Name cloudwatch_logs
Match winlog.*
Expand Down Expand Up @@ -514,7 +516,7 @@ agent:
## TLS Certificate Option 2: Use certManager to generate self-signed certificate.
## certManager must be enabled. If enabled, it takes precedence over option 1.
certManager:
enabled: false
enabled: false
## Provide the issuer kind and name to do the cert auth job.
## By default, OpenTelemetry Operator will use self-signer issuer.
issuerRef: { }
Expand Down Expand Up @@ -565,7 +567,7 @@ dcgmExporter:
us-gov-east-1: 743662458514.dkr.ecr.us-gov-east-1.amazonaws.com
us-gov-west-1: 743662458514.dkr.ecr.us-gov-west-1.amazonaws.com
configmap: dcgm-exporter-config-map
arguments: ["--web-config-file=/etc/dcgm-exporter/web-config.yaml"]
arguments: ["--web-config-file=/etc/dcgm-exporter/web-config.yaml" ]
service:
enable: true
type: ClusterIP
Expand Down

0 comments on commit 48781f5

Please sign in to comment.