diff --git a/cdk.json b/cdk.json
index dc36022b..e9672526 100644
--- a/cdk.json
+++ b/cdk.json
@@ -32,22 +32,21 @@
         "GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json",
         "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json",
         "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json",
-        "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json"
+        "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json",
+        "GRAFANA_ISTIO_CP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-control-plane-dashboard.json",
+        "GRAFANA_ISTIO_MESH_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-mesh-dashboard.json",
+        "GRAFANA_ISTIO_PERF_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-performance-dashboard.json",
+        "GRAFANA_ISTIO_SERVICE_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-service-dashboard.json"
       },
       "kustomizations": [
         {
           "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure"
+        },
+        {
+          "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/istio"
         }
       ]
     },
-    "gpuNodeGroup": {
-      "instanceType": "g4dn.xlarge",
-      "desiredSize": 2,
-      "minSize": 2,
-      "maxSize": 3,
-      "ebsSize": 50
-    },
-    "existing.cluster.name": "single-new-eks-observability-accelerator",
-    "existing.kubectl.rolename": "YOUR_KUBECTL_ROLE"
+    "istio.pattern.enabled": true
   }
 }
\ No newline at end of file
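The new `istio.pattern.enabled` flag is the switch the pattern code later in this diff keys off, while the four `GRAFANA_ISTIO_*_DASH_URL` values and the extra kustomization path feed the Grafana Operator manifests for the Istio dashboards (pinned to the v0.2.0 dashboard JSON rather than main). A minimal sketch of reading the flag; only `utils.valueFromContext` and the context key come from this diff, the helper name `isIstioEnabled` is illustrative:

// Sketch only: reading the new context flag the same way the pattern code in
// this diff does. "isIstioEnabled" is a hypothetical helper, not part of this PR.
import { Construct } from 'constructs';
import { utils } from '@aws-quickstart/eks-blueprints';

export function isIstioEnabled(scope: Construct): boolean {
    // Defaults to false, so existing stacks are untouched unless cdk.json
    // (or `cdk deploy -c istio.pattern.enabled=true`) opts in.
    return utils.valueFromContext(scope, "istio.pattern.enabled", false);
}
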
diff --git a/lib/common/resources/amp-config/istio/alerting-rules.yml b/lib/common/resources/amp-config/istio/alerting-rules.yml
new file mode 100644
index 00000000..ef9f7fcd
--- /dev/null
+++ b/lib/common/resources/amp-config/istio/alerting-rules.yml
@@ -0,0 +1,113 @@
+ groups:
+   - name: "istio.basic.alerting-rules"
+     rules:
+     - alert: IngressTrafficMissing
+       annotations:
+         summary: 'ingress gateway traffic missing'
+         description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs'
+       expr: >
+         absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1
+       for: 5m
+     - alert: IstioMetricsMissing
+       annotations:
+         summary: 'Istio Metrics missing'
+         description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly'
+       expr: >
+         absent(istio_request_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1
+       for: 5m
+   - name: "istio.workload.alerting-rules"
+     rules:
+     - alert: HTTP5xxRateHigh
+       annotations:
+         summary: '5xx rate too high'
+         description: 'The HTTP 5xx error rate is higher than 0.05 in 5 mins'
+       expr: >
+         sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05
+       for: 5m
+     - alert: WorkloadLatencyP99High
+       expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160
+       for: 10m
+       annotations:
+         description: 'The workload request latency P99 > 160ms'
+         message: "Request duration has slowed down for workload: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"
+     - alert: IngressLatencyP99High
+       expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250
+       for: 10m
+       annotations:
+         description: 'The ingress latency P99 > 250ms'
+         message: "Request duration has slowed down for ingress: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"
+   - name: "istio.infra.alerting-rules"
+     rules:
+     - alert: ProxyContainerCPUUsageHigh
+       expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80
+       for: 5m
+       annotations:
+         summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
+         description: "Proxy Container CPU usage is above 80%"
+     - alert: ProxyContainerMemoryUsageHigh
+       expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80
+       for: 5m
+       annotations:
+         summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
+         description: "Proxy Container Memory usage is above 80%"
+     - alert: IngressMemoryUsageIncreaseRateHigh
+       expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200
+       for: 180m
+       annotations:
+         summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n"
+         description: "Ingress proxy Memory usage increases more than 200 Bytes/sec"
+     - alert: IstiodContainerCPUUsageHigh
+       expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (pod) * 100) > 80
+       for: 5m
+       annotations:
+         summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
+         description: "Istiod Container CPU usage is above 80%"
+     - alert: IstiodMemoryUsageHigh
+       expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (pod) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (pod) > 0)* 100) > 80
+       for: 5m
+       annotations:
+         summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
+         description: "Istiod Container Memory usage is above 80%"
+     - alert: IstiodMemoryUsageIncreaseRateHigh
+       expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000
+       for: 300m
+       annotations:
+         summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n"
+         description: "Istiod Container Memory usage increases more than 1k Bytes/sec"
+   - name: "istio.controlplane.alerting-rules"
+     rules:
+     - alert: IstiodxdsPushErrorsHigh
+       annotations:
+         summary: 'istiod push errors are too high'
+         description: 'istiod push error rate is higher than 0.05'
+       expr: >
+         sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
+       for: 5m
+     - alert: IstiodxdsRejectHigh
+       annotations:
+         summary: 'istiod reject rate is too high'
+         description: 'istiod reject rate is higher than 0.05'
+       expr: >
+         sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
+       for: 5m
+     - alert: IstiodContainerNotReady
+       annotations:
+         summary: 'istiod container not ready'
+         description: 'container: discovery not running'
+       expr: >
+         kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0
+       for: 5m
+     - alert: IstiodUnavailableReplica
+       annotations:
+         summary: 'Istiod unavailable pod'
+         description: 'Istiod unavailable replica > 0'
+       expr: >
+         kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0
+       for: 5m
+     - alert: Ingress200RateLow
+       annotations:
+         summary: 'ingress gateway 200 rate drops'
+         description: 'The expected rate is 100 per namespace, the limit is set based on 15 namespaces'
+       expr: >
+         sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490
+       for: 30m
\ No newline at end of file
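The rule file above is handed to the Amazon Managed Prometheus add-on by path (see the `ampRules?.ruleFilePaths.push(...)` call in the index.ts change further down), and AMP rejects rule groups that do not parse. A minimal pre-deploy sanity check, assuming `js-yaml` is available as a dev dependency (it is not added by this PR):

// Sanity-check sketch (not part of this PR): parse the rule file and verify
// every alerting/recording rule carries an expression before it reaches AMP.
import * as fs from 'fs';
import * as yaml from 'js-yaml';

interface PrometheusRule { alert?: string; record?: string; expr?: string; }
interface RuleGroupFile { groups: { name: string; rules: PrometheusRule[] }[]; }

const path = 'lib/common/resources/amp-config/istio/alerting-rules.yml';
const doc = yaml.load(fs.readFileSync(path, 'utf8')) as RuleGroupFile;

for (const group of doc.groups) {
    for (const rule of group.rules) {
        if (!rule.expr) {
            throw new Error(`${rule.alert ?? rule.record} in ${group.name} has no expr`);
        }
    }
}
console.log(`${path}: ${doc.groups.length} rule groups parsed OK`);
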
diff --git a/lib/common/resources/amp-config/istio/recording-rules.yml b/lib/common/resources/amp-config/istio/recording-rules.yml
new file mode 100644
index 00000000..c2908934
--- /dev/null
+++ b/lib/common/resources/amp-config/istio/recording-rules.yml
@@ -0,0 +1,59 @@
+ groups:
+   - name: "istio.recording-rules"
+     interval: 5s
+     rules:
+     - record: "workload:istio_requests_total"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total)
+
+     - record: "workload:istio_request_duration_milliseconds_count"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count)
+
+     - record: "workload:istio_request_duration_milliseconds_sum"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum)
+
+     - record: "workload:istio_request_duration_milliseconds_bucket"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket)
+
+     - record: "workload:istio_request_bytes_count"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count)
+
+     - record: "workload:istio_request_bytes_sum"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum)
+
+     - record: "workload:istio_request_bytes_bucket"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket)
+
+     - record: "workload:istio_response_bytes_count"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count)
+
+     - record: "workload:istio_response_bytes_sum"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum)
+
+     - record: "workload:istio_response_bytes_bucket"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket)
+
+     - record: "workload:istio_tcp_sent_bytes_total"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total)
+
+     - record: "workload:istio_tcp_received_bytes_total"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total)
+
+     - record: "workload:istio_tcp_connections_opened_total"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total)
+
+     - record: "workload:istio_tcp_connections_closed_total"
+       expr: |
+         sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total)
\ No newline at end of file
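All fourteen recording rules above share one shape: pre-aggregate an Istio metric with `sum without(instance, kubernetes_namespace, kubernetes_pod_name)` so dashboards and alerts can read a single `workload:*` series per metric instead of summing raw per-pod scrapes. Purely to make that structure explicit (this is not code the PR adds), the group could be generated from the metric list:

// Illustration only (not part of this PR): the recording-rule group expressed
// as a mapping over the Istio metric names it aggregates.
const workloadMetrics = [
    "istio_requests_total",
    "istio_request_duration_milliseconds_count",
    "istio_request_duration_milliseconds_sum",
    "istio_request_duration_milliseconds_bucket",
    "istio_request_bytes_count",
    "istio_request_bytes_sum",
    "istio_request_bytes_bucket",
    "istio_response_bytes_count",
    "istio_response_bytes_sum",
    "istio_response_bytes_bucket",
    "istio_tcp_sent_bytes_total",
    "istio_tcp_received_bytes_total",
    "istio_tcp_connections_opened_total",
    "istio_tcp_connections_closed_total"
];

const recordingRules = workloadMetrics.map(metric => ({
    record: `workload:${metric}`,
    // Drop only the per-scrape labels; workload, namespace, response_code and
    // the other Istio labels are kept on the recorded series.
    expr: `sum without(instance, kubernetes_namespace, kubernetes_pod_name) (${metric})`
}));

console.log(JSON.stringify({ groups: [{ name: "istio.recording-rules", interval: "5s", rules: recordingRules }] }, null, 2));
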
"workload:istio_response_bytes_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket) + + - record: "workload:istio_tcp_sent_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total) + + - record: "workload:istio_tcp_received_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total) + + - record: "workload:istio_tcp_connections_opened_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total) + + - record: "workload:istio_tcp_connections_closed_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total) \ No newline at end of file diff --git a/lib/single-new-eks-opensource-observability-pattern/index.ts b/lib/single-new-eks-opensource-observability-pattern/index.ts index f03ccc51..0fac6d1d 100644 --- a/lib/single-new-eks-opensource-observability-pattern/index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/index.ts @@ -2,6 +2,8 @@ import { Construct } from 'constructs'; import { utils } from '@aws-quickstart/eks-blueprints'; import * as blueprints from '@aws-quickstart/eks-blueprints'; import { GrafanaOperatorSecretAddon } from './grafanaoperatorsecretaddon'; +import * as eks from 'aws-cdk-lib/aws-eks'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; import * as amp from 'aws-cdk-lib/aws-aps'; import { ObservabilityBuilder } from '@aws-quickstart/eks-blueprints'; import * as fs from 'fs'; @@ -97,6 +99,20 @@ export default class SingleNewEksOpenSourceobservabilityPattern { ); } + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', + manifestParameterMap: { + javaScrapeSampleLimit: 1000, + javaPrometheusMetricsEndpoint: "/metrics" + } + }; + ampAddOnProps.ampRules?.ruleFilePaths.push( + __dirname + '/../common/resources/amp-config/istio/alerting-rules.yml', + __dirname + '/../common/resources/amp-config/istio/recording-rules.yml' + ); + } + Reflect.defineMetadata("ordered", true, blueprints.addons.GrafanaOperatorAddon); const addOns: Array = [ new blueprints.addons.CloudWatchLogsAddon({ @@ -108,9 +124,28 @@ export default class SingleNewEksOpenSourceobservabilityPattern { new GrafanaOperatorSecretAddon() ]; + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + const istioControlPlaneAddOnProps = { + version: "1.18.2", + } + addOns.push(new blueprints.addons.IstioBaseAddOn({ + version: "1.18.2" + })); + addOns.push(new blueprints.addons.IstioControlPlaneAddOn(istioControlPlaneAddOnProps)); + } + + const mngProps: blueprints.MngClusterProviderProps = { + version: eks.KubernetesVersion.of("1.28"), + instanceTypes: [new ec2.InstanceType("m5.2xlarge")], + amiType: eks.NodegroupAmiType.AL2_X86_64, + desiredSize: 2, + maxSize: 3, + }; + ObservabilityBuilder.builder() .account(account) .region(region) + .clusterProvider(new blueprints.MngClusterProvider(mngProps)) .resourceProvider(ampWorkspaceName, new blueprints.CreateAmpProvider(ampWorkspaceName, ampWorkspaceName)) .version('auto') .withAmpProps(ampAddOnProps)