Skip to content

Commit

Permalink
Support multiple AmazonCloudWatchAgent CRs and add integration for Pr…
Browse files Browse the repository at this point in the history
…ometheus Target Allocator (#126)
  • Loading branch information
okankoAMZ authored Dec 18, 2024
1 parent 159da8c commit 28fee3b
Show file tree
Hide file tree
Showing 9 changed files with 1,755 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
build
.tmp
*.iml
.DS_Store
*.DS_Store
.idea
.attach_pid*
bin

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions charts/amazon-cloudwatch-observability/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,31 @@ Name for neuron-monitor
{{- default "neuron-monitor" .Values.neuronMonitor.name }}
{{- end }}

{{/*
Build the cloudwatch agent image reference "<domain>/<repository>:<tag>" for a single agent.

Context: the root context, optionally merged with an "image" key that carries
per-agent overrides (see the caller, which passes `dict "image" $customAgent.image`).

- repository / tag: taken from the override when set to a non-empty value,
  otherwise from .Values.agent.image. A key that is present but blank is treated
  as "not overridden" so it cannot render as "%!s(<nil>)".
- domain: .Values.agent.image.repositoryDomainMap[.Values.region], falling back
  to the "public" entry when the region has no mapping.

%v is used instead of %s so an unquoted numeric tag still renders as a value
rather than a formatting error.
*/}}
{{- define "cloudwatch-agent.modify-image" -}}
{{- $defaults := .Values.agent.image -}}
{{- $overrides := .image | default (dict) -}}
{{- $repository := $overrides.repository | default $defaults.repository -}}
{{- $tag := $overrides.tag | default $defaults.tag -}}
{{- $imageDomain := index $defaults.repositoryDomainMap .Values.region | default $defaults.repositoryDomainMap.public -}}
{{- printf "%v/%v:%v" $imageDomain $repository $tag -}}
{{- end -}}

{{/*
Get the current recommended cloudwatch agent image for a region
*/}}
Expand All @@ -108,6 +133,31 @@ Get the current recommended cloudwatch agent operator image for a region
{{- printf "%s/%s:%s" $imageDomain .Values.manager.image.repository .Values.manager.image.tag -}}
{{- end -}}

{{/*
Build the target allocator image reference "<domain>/<repository>:<tag>" for a single agent.

Context: the root context, optionally merged with an "image" key that carries
the per-agent target allocator image overrides.

- repository / tag: taken from the override when set to a non-empty value,
  otherwise from .Values.agent.prometheus.targetAllocator.image. A key that is
  present but blank is treated as "not overridden" so it cannot render as
  "%!s(<nil>)".
- domain: repositoryDomainMap[.Values.region] from the same defaults, falling
  back to the "public" entry when the region has no mapping.

%v is used instead of %s so an unquoted numeric tag still renders as a value
rather than a formatting error.
*/}}
{{- define "target-allocator.modify-image" -}}
{{- $defaults := .Values.agent.prometheus.targetAllocator.image -}}
{{- $overrides := .image | default (dict) -}}
{{- $repository := $overrides.repository | default $defaults.repository -}}
{{- $tag := $overrides.tag | default $defaults.tag -}}
{{- $imageDomain := index $defaults.repositoryDomainMap .Values.region | default $defaults.repositoryDomainMap.public -}}
{{- printf "%v/%v:%v" $imageDomain $repository $tag -}}
{{- end -}}

{{/*
Get the current recommended fluent-bit image for a region
*/}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ spec:
{{- end }}
{{- end }}

{{- if ( .Values.agent.certManager.enabled) }}
{{- if ( .Values.agent.certManager.enabled) }}
---
apiVersion: cert-manager.io/v1
kind: Certificate
Expand All @@ -55,6 +55,9 @@ metadata:
namespace: {{ .Release.Namespace }}
spec:
dnsNames:
{{- range $i, $customAgent := .Values.agents }}
- {{( printf "%s-target-allocator-service" $customAgent.name )}}
{{- end }}
- "dcgm-exporter-service"
- "dcgm-exporter-service.amazon-cloudwatch.svc"
- "neuron-monitor-service"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
{{- if .Values.agent.enabled }}
{{- if and (.Values.agent.autoGenerateCert.enabled) (not .Values.agent.certManager.enabled) -}}
{{- $altNames := list ( printf "%s-service" (include "dcgm-exporter.name" .) ) ( printf "%s-service" (include "neuron-monitor.name" .) ) ( printf "%s-service.%s.svc" (include "dcgm-exporter.name" .) .Release.Namespace ) ( printf "%s-service.%s.svc" (include "neuron-monitor.name" .) .Release.Namespace ) -}}
{{- $agentAltNames := list ( printf "%s" (include "cloudwatch-agent.name" .) ) ( printf "%s.%s.svc" (include "cloudwatch-agent.name" .) .Release.Namespace ) -}}
{{- range $i, $customAgent := .Values.agents }}
{{ $altNames = append $altNames ( printf "%s-target-allocator-service" $customAgent.name )}}
{{- end }}
{{- $agentAltNames := list ( printf "%s" (include "cloudwatch-agent.name" .) ) ( printf "%s.%s.svc" (include "cloudwatch-agent.name" .) .Release.Namespace ) -}}
{{- $ca := genCA ("agent-ca") ( .Values.agent.autoGenerateCert.expiryDays | int ) -}}
{{- $cert := genSignedCert ("agent") nil $altNames ( .Values.admissionWebhooks.autoGenerateCert.expiryDays | int ) $ca -}}
{{- $serverCert := genSignedCert ("agent-server") nil $agentAltNames ( .Values.admissionWebhooks.autoGenerateCert.expiryDays | int ) $ca -}}
Expand All @@ -10,7 +13,7 @@ apiVersion: v1
kind: Secret
metadata:
labels:
{{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
{{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
name: "amazon-cloudwatch-observability-agent-cert"
namespace: {{ .Release.Namespace }}
data:
Expand Down Expand Up @@ -44,41 +47,67 @@ data:
---
{{- end -}}

{{- $clusterName := .Values.clusterName | required ".Values.clusterName is required." -}}
{{- $region := .Values.region | required ".Values.region is required." -}}
{{- range $i, $customAgent := .Values.agents }}
{{- $clusterName := $.Values.clusterName | required "$.Values.clusterName is required." -}}
{{- $region := $.Values.region | required "$.Values.region is required." -}}
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: AmazonCloudWatchAgent
metadata:
name: {{ template "cloudwatch-agent.name" . }}
namespace: {{ .Release.Namespace }}
name: {{ $customAgent.name | default (include "cloudwatch-agent.name" $) }}
namespace: {{ $.Release.Namespace }}
spec:
image: {{ template "cloudwatch-agent.image" . }}
mode: daemonset
image: {{ include "cloudwatch-agent.modify-image" (merge (dict "image" $customAgent.image) $ ) }}
mode: {{ $customAgent.mode | default "daemonset" }}
replicas: {{ $customAgent.replicas | default 1 }}
nodeSelector:
kubernetes.io/os: linux
serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }}
priorityClassName: {{ .Values.agent.priorityClassName }}
{{- /* Use the agent's own serviceAccount.name when provided; map keys are case-sensitive, so it must be read as ".name" (the key tested by hasKey), not ".Name". Otherwise fall back to the chart-wide service account name. */}}
  serviceAccount: {{ if hasKey ($customAgent.serviceAccount) "name" }}{{ $customAgent.serviceAccount.name }}{{ else }}{{ (include "cloudwatch-agent.serviceAccountName" $) }}{{ end }}
priorityClassName: {{ $customAgent.priorityClassName | default $.Values.agent.priorityClassName }}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: {{ .Values.fargateLabelKey }}
- key: {{ $.Values.fargateLabelKey }}
operator: NotIn
values:
- fargate
hostNetwork: true
{{- if .Values.agent.config }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }}
{{- if $customAgent.config }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" $customAgent.config) $ ) }}
{{- else }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.defaultConfig) . ) }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" $.Values.agent.defaultConfig) $ ) }}
{{- end }}
{{- /* Per-agent otelConfig wins over the chart-wide one. Inside the range "." is the current agents entry, so the root context ($) is merged in explicitly, the same way the "config" lines do. */}}
{{- if $customAgent.otelConfig }}
  otelConfig: {{ include "cloudwatch-agent.modify-yaml-config" (merge (dict "OtelConfig" $customAgent.otelConfig) $ ) }}
{{- else if $.Values.agent.otelConfig }}
  otelConfig: {{ include "cloudwatch-agent.modify-yaml-config" (merge (dict "OtelConfig" $.Values.agent.otelConfig) $ ) }}
{{- end }}
{{- if .Values.agent.otelConfig }}
otelConfig: {{ include "cloudwatch-agent.modify-yaml-config" (merge (dict "OtelConfig" .Values.agent.otelConfig) . ) }}
{{- if $customAgent.prometheus }}
{{- if $customAgent.prometheus.config }}
prometheus:
{{- with $customAgent.prometheus.config }}
config:
{{- toYaml . | nindent 6 }}
{{- end }}
{{- end }}
{{- with .Values.agent.resources }}
resources: {{- toYaml . | nindent 4}}
{{- if $customAgent.prometheus.targetAllocator }}
targetAllocator:
enabled: {{ $customAgent.prometheus.targetAllocator.enabled | default false }}
image: {{ include "target-allocator.modify-image" (merge (dict "image" $customAgent.prometheus.targetAllocator.image) $ ) }}
allocationStrategy: "consistent-hashing"
{{- if $customAgent.prometheus.targetAllocator.prometheusCR }}
prometheusCR: {{ $customAgent.prometheus.targetAllocator.prometheusCR.enabled | default false }}
{{- end }}
{{- end }}
{{- end }}
resources:
requests:
memory: {{ if and (hasKey ($customAgent.resources) "requests") (hasKey ($customAgent.resources.requests) "memory") }}{{ $customAgent.resources.requests.memory }}{{ else }}{{ $.Values.agent.resources.requests.memory }}{{ end }}
cpu: {{ if and (hasKey ($customAgent.resources) "requests") (hasKey ($customAgent.resources.requests) "cpu") }}{{ $customAgent.resources.requests.cpu }}{{ else }}{{ $.Values.agent.resources.requests.cpu }}{{ end }}
limits:
memory: {{ if and (hasKey ($customAgent.resources) "limits") (hasKey ($customAgent.resources.limits) "memory") }}{{ $customAgent.resources.limits.memory }}{{ else }}{{ $.Values.agent.resources.limits.memory }}{{ end }}
cpu: {{ if and (hasKey ($customAgent.resources) "limits") (hasKey ($customAgent.resources.limits) "cpu") }}{{ $customAgent.resources.limits.cpu }}{{ else }}{{ $.Values.agent.resources.limits.cpu }}{{ end }}
volumeMounts:
- mountPath: /rootfs
name: rootfs
Expand Down Expand Up @@ -168,7 +197,9 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
{{- with .Values.tolerations }}
{{- with $.Values.tolerations }}
tolerations: {{- toYaml . | nindent 2}}
{{- end }}
---
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,6 @@ rules:
- apiGroups: [ "route.openshift.io" ]
resources: [ "routes", "routes/custom-host" ]
verbs: [ "create","delete","get","list","patch","update","watch" ]
- apiGroups: [ "policy" ]
resources: [ "poddisruptionbudgets" ]
verbs: [ "create","delete","get","list","patch","update","watch" ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{{- /*
ClusterRole for the Prometheus target allocator.

The role has a fixed name, so it is rendered at most once: when at least one
entry in .Values.agents sets prometheus.targetAllocator.enabled. Rendering it
inside the range would emit one copy per enabled agent.

The monitoring.coreos.com rule is added when any enabled target allocator also
sets prometheusCR.enabled (lower-camel "prometheusCR", matching values.yaml).
*/ -}}
{{- if .Values.agent.enabled }}
{{- $allocatorEnabled := false }}
{{- $prometheusCREnabled := false }}
{{- range $customAgent := .Values.agents }}
{{- /* Missing sections default to empty maps so the lookups below never dereference nil. */}}
{{- $prometheus := $customAgent.prometheus | default (dict) }}
{{- $allocator := $prometheus.targetAllocator | default (dict) }}
{{- if $allocator.enabled }}
{{- $allocatorEnabled = true }}
{{- $prometheusCR := $allocator.prometheusCR | default (dict) }}
{{- if $prometheusCR.enabled }}
{{- $prometheusCREnabled = true }}
{{- end }}
{{- end }}
{{- end }}
{{- if $allocatorEnabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    {{- include "amazon-cloudwatch-observability.labels" $ | nindent 4 }}
  name: "cloudwatch-agent-target-allocator-role"
rules:
  - apiGroups: [ "" ]
    resources: [ "pods", "nodes", "nodes/metrics", "services", "endpoints" ]
    verbs: [ "list", "watch", "get" ]
  - apiGroups: [ "" ]
    resources: [ "configmaps" ]
    verbs: [ "get" ]
  - apiGroups: [ "discovery.k8s.io" ]
    resources: [ "endpointslices" ]
    verbs: [ "get", "list", "watch" ]
  - apiGroups: [ "networking.k8s.io" ]
    resources: [ "ingresses" ]
    verbs: [ "get", "list", "watch" ]
  - nonResourceURLs: [ "/metrics" ]
    verbs: [ "get" ]
  {{- if $prometheusCREnabled }}
  - apiGroups: [ "monitoring.coreos.com" ]
    resources: [ "podmonitors", "servicemonitors" ]
    verbs: [ "get", "list", "watch" ]
  {{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{{- /*
ClusterRoleBinding that binds the target allocator service account to the
"cloudwatch-agent-target-allocator-role" ClusterRole.

The binding has a fixed name, so it is rendered at most once: when at least one
entry in .Values.agents sets prometheus.targetAllocator.enabled. Rendering it
inside the range would emit one copy per enabled agent.
*/ -}}
{{- if .Values.agent.enabled }}
{{- $allocatorEnabled := false }}
{{- range $customAgent := .Values.agents }}
{{- /* Missing sections default to empty maps so the lookup below never dereferences nil. */}}
{{- $prometheus := $customAgent.prometheus | default (dict) }}
{{- $allocator := $prometheus.targetAllocator | default (dict) }}
{{- if $allocator.enabled }}
{{- $allocatorEnabled = true }}
{{- end }}
{{- end }}
{{- if $allocatorEnabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    {{- include "amazon-cloudwatch-observability.labels" $ | nindent 4 }}
  name: "cloudwatch-agent-target-allocator-rolebinding"
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: "cloudwatch-agent-target-allocator-role"
subjects:
  - kind: ServiceAccount
    name: "target-allocator-service-acct"
    namespace: {{ $.Release.Namespace }}
{{- end }}
{{- end }}
22 changes: 22 additions & 0 deletions charts/amazon-cloudwatch-observability/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1311,8 +1311,15 @@ admissionWebhooks:
secretAnnotations: {}
## Secret labels
secretLabels: {}

## List of AmazonCloudWatchAgent workloads to install and manage. Each entry represents an independent installation of the AmazonCloudWatchAgent custom resource. Every entry uses the schema and the defaults from $.agent, so specify only the values you want to override.
agents:
- name: cloudwatch-agent

agent:
name:
mode: # represents the mode the cloudwatch-agent will run in (deployment, daemonset or statefulset)
replicas: # The total number of non-terminated pods targeted by this cloudwatch-agent's deployment or statefulSet.
image:
repository: cloudwatch-agent
tag: 1.300050.0b956
Expand Down Expand Up @@ -1369,6 +1376,21 @@ agent:
}
}
}
prometheus:
config:
targetAllocator:
enabled: false
image:
repository: cloudwatch-agent-target-allocator
tag: 1.0.0
repositoryDomainMap:
public: public.ecr.aws/cloudwatch-agent
cn-north-1: 934860584483.dkr.ecr.cn-north-1.amazonaws.com.cn
cn-northwest-1: 934860584483.dkr.ecr.cn-northwest-1.amazonaws.com.cn
us-gov-east-1: 743662458514.dkr.ecr.us-gov-east-1.amazonaws.com
us-gov-west-1: 743662458514.dkr.ecr.us-gov-west-1.amazonaws.com
prometheusCR:
enabled: false

dcgmExporter:
name:
Expand Down

0 comments on commit 28fee3b

Please sign in to comment.