Skip to content

Commit

Permalink
Added default tolerations. (#41)
Browse files Browse the repository at this point in the history
  • Loading branch information
musa-asad authored May 22, 2024
1 parent cc910d3 commit 32e8402
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1293,6 +1293,48 @@ spec:
description: TlsConfig is the raw YAML to be used as the exporter
TLS configuration.
type: string
tolerations:
description: Toleration to schedule DCGM Exporter pods.
This is only relevant to daemonset, statefulset, and deployment
mode
items:
description: The pod this Toleration is attached to tolerates any
taint that matches the triple <key,value,effect> using the matching
operator <operator>.
properties:
effect:
description: Effect indicates the taint effect to match. Empty
means match all taint effects. When specified, allowed values
are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Key is the taint key that the toleration applies
to. Empty means match all taint keys. If the key is empty,
operator must be Exists; this combination means to match all
values and all keys.
type: string
operator:
description: Operator represents a key's relationship to the
value. Valid operators are Exists and Equal. Defaults to Equal.
Exists is equivalent to wildcard for value, so that a pod
can tolerate all taints of a particular category.
type: string
tolerationSeconds:
description: TolerationSeconds represents the period of time
the toleration (which must be of effect NoExecute, otherwise
this field is ignored) tolerates the taint. By default, it
is not set, which means tolerate the taint forever (do not
evict). Zero and negative values will be treated as 0 (evict
immediately) by the system.
format: int64
type: integer
value:
description: Value is the taint value the toleration matches
to. If the operator is Exists, the value should be empty,
otherwise just a regular string.
type: string
type: object
type: array
volumeMounts:
description: VolumeMounts represents the mount points to use in the
underlying collector deployment(s)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1174,6 +1174,48 @@ spec:
monitorConfig:
description: MonitorConfig is the raw Json to be used as monitor configuration.
type: string
tolerations:
description: Toleration to schedule Neuron Monitor Exporter pods.
This is only relevant to daemonset, statefulset, and deployment
mode
items:
description: The pod this Toleration is attached to tolerates any
taint that matches the triple <key,value,effect> using the matching
operator <operator>.
properties:
effect:
description: Effect indicates the taint effect to match. Empty
means match all taint effects. When specified, allowed values
are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Key is the taint key that the toleration applies
to. Empty means match all taint keys. If the key is empty,
operator must be Exists; this combination means to match all
values and all keys.
type: string
operator:
description: Operator represents a key's relationship to the
value. Valid operators are Exists and Equal. Defaults to Equal.
Exists is equivalent to wildcard for value, so that a pod
can tolerate all taints of a particular category.
type: string
tolerationSeconds:
description: TolerationSeconds represents the period of time
the toleration (which must be of effect NoExecute, otherwise
this field is ignored) tolerates the taint. By default, it
is not set, which means tolerate the taint forever (do not
evict). Zero and negative values will be treated as 0 (evict
immediately) by the system.
format: int64
type: integer
value:
description: Value is the taint value the toleration matches
to. If the operator is Exists, the value should be empty,
otherwise just a regular string.
type: string
type: object
type: array
nodeSelector:
additionalProperties:
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 2}}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,7 @@ spec:
tlsConfig: |
tls_server_config:
cert_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.crt
key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 2}}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,7 @@ spec:
serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
nodeSelector:
kubernetes.io/os: linux
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 6}}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,7 @@ spec:
"type": "neuron_hw_counters"
}
]
}
}
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 2}}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,6 @@ spec:
secretName: {{ template "amazon-cloudwatch-observability.certificateSecretName" . }}
nodeSelector:
kubernetes.io/os: linux
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 6}}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,7 @@ spec:
value: "True"
- name: RUN_AS_HOST_PROCESS_CONTAINER
value: "True"
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 2}}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,7 @@ spec:
terminationGracePeriodSeconds: 10
dnsPolicy: ClusterFirstWithHostNet
serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
{{- with .Values.tolerations }}
tolerations: {{- toYaml . | nindent 6}}
{{- end }}
{{- end }}
4 changes: 4 additions & 0 deletions charts/amazon-cloudwatch-observability/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ gpuInstances: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.1
## Trainium/Inferentia instance types
neuronInstances: [ trn1.2xlarge, trn1.32xlarge, trn1n.32xlarge, inf1.xlarge, inf1.2xlarge, inf1.6xlarge, inf1.24xlarge, inf2.xlarge, inf2.8xlarge, inf2.24xlarge, inf2.48xlarge, ml.trn1.2xlarge, ml.trn1.32xlarge, ml.trn1n.32xlarge, ml.inf1.xlarge, ml.inf1.2xlarge, ml.inf1.6xlarge, ml.inf1.24xlarge, ml.inf2.xlarge, ml.inf2.8xlarge, ml.inf2.24xlarge, ml.inf2.48xlarge]

## Provide default tolerations
## These are rendered into the chart templates that reference `.Values.tolerations`.
## The single default entry sets only `operator: Exists`; with no key and no
## effect specified it matches all taint keys, values, and effects, so the
## pods tolerate every taint.
tolerations:
- operator: Exists

containerLogs:
enabled: true
fluentBit:
Expand Down

0 comments on commit 32e8402

Please sign in to comment.