Skip to content

Commit

Permalink
Istio Monitoring Pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
elamaran11 committed Jan 24, 2024
1 parent 70b4baf commit 30b06de
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 10 deletions.
19 changes: 9 additions & 10 deletions cdk.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,21 @@
"GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json",
"GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json",
"GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json",
"GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json"
"GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json",
"GRAFANA_ISTIO_CP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-control-plane-dashboard.json",
"GRAFANA_ISTIO_MESH_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-mesh-dashboard.json",
"GRAFANA_ISTIO_PERF_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-performance-dashboard.json",
"GRAFANA_ISTIO_SERVICE_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-service-dashboard.json"
},
"kustomizations": [
{
"kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure"
},
{
"kustomizationPath": "./artifacts/grafana-operator-manifests/eks/istio"
}
]
},
"gpuNodeGroup": {
"instanceType": "g4dn.xlarge",
"desiredSize": 2,
"minSize": 2,
"maxSize": 3,
"ebsSize": 50
},
"existing.cluster.name": "single-new-eks-observability-accelerator",
"existing.kubectl.rolename": "YOUR_KUBECTL_ROLE",
"istio.pattern.enabled": true
}
}
113 changes: 113 additions & 0 deletions lib/common/resources/amp-config/istio/alerting-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Prometheus alerting rules for an Istio service mesh.
# Groups: basic metric presence, workload traffic/latency, proxy and istiod
# container resources, and istiod control-plane health.
groups:
  # Presence checks: fire when expected Istio series are missing entirely.
  - name: "istio.basic.alerting-rules"
    rules:
      # No request series from the ingress gateway to service-graph.* namespaces.
      - alert: IngressTrafficMissing
        annotations:
          summary: 'ingress gateway traffic missing'
          description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs'
        expr: >
          absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1
        for: 5m
      # Core Istio request metrics are absent. The counter is named
      # istio_requests_total (plural), matching every other rule in this file.
      - alert: IstioMetricsMissing
        annotations:
          summary: 'Istio Metrics missing'
          description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly'
        expr: >
          absent(istio_requests_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1
        for: 5m

  # Workload-level error rate and latency.
  - name: "istio.workload.alerting-rules"
    rules:
      # Share of 5xx responses (destination-reported) above 5%.
      - alert: HTTP5xxRateHigh
        annotations:
          summary: '5xx rate too high'
          description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins'
        expr: >
          sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05
        for: 5m
      # P99 request duration for source workloads matching svc.* above 160 ms.
      - alert: WorkloadLatencyP99High
        expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160
        for: 10m
        annotations:
          description: 'The workload request latency P99 > 160ms '
          # Plain alert templating so the label values are rendered.
          message: "Request duration has slowed down for workload: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds"
      # P99 request duration for source workloads matching istio.* above 250 ms.
      - alert: IngressLatencyP99High
        expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250
        for: 10m
        annotations:
          description: 'The ingress latency P99 > 250ms '
          message: "Request duration has slowed down for ingress: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds"

  # Resource usage of istio-proxy sidecars and the istiod "discovery" container.
  - name: "istio.infra.alerting-rules"
    rules:
      - alert: ProxyContainerCPUUsageHigh
        expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80
        for: 5m
        annotations:
          summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
          description: "Proxy Container CPU usage is above 80%"
      # Working-set memory as a percentage of the container memory limit
      # (containers without a limit are filtered out by the "> 0" guard).
      - alert: ProxyContainerMemoryUsageHigh
        expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80
        for: 5m
        annotations:
          summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
          description: "Proxy Container Memory usage is above 80%"
      - alert: IngressMemoryUsageIncreaseRateHigh
        expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200
        for: 180m
        annotations:
          summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n"
          description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec"
      # Grouped by namespace, pod and container so the labels referenced in the
      # summary are present on the alert (the selectors already pin namespace
      # and container, so the series set per pod is unchanged).
      - alert: IstiodContainerCPUUsageHigh
        expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (namespace, pod, container) * 100) > 80
        for: 5m
        annotations:
          summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
          description: "Istiod Container CPU usage is above 80%"
      # Same grouping on both sides of the division so the vectors still match.
      - alert: IstiodMemoryUsageHigh
        expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) > 0)* 100) > 80
        for: 5m
        annotations:
          summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
          description: "Istiod Container Memory usage is above 80%"
      - alert: IstiodMemoryUsageIncreaseRateHigh
        expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000
        for: 300m
        annotations:
          summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n"
          description: "Istiod Container Memory usage increases more than 1k Bytes/sec"

  # istiod control-plane health: xDS push errors/rejects, readiness, replicas,
  # and ingress success throughput.
  - name: "istio.controlplane.alerting-rules"
    rules:
      - alert: IstiodxdsPushErrorsHigh
        annotations:
          summary: 'istiod push errors is too high'
          description: 'istiod push error rate is higher than 0.05'
        expr: >
          sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
        for: 5m
      - alert: IstiodxdsRejectHigh
        annotations:
          summary: 'istiod rejects rate is too high'
          description: 'istiod rejects rate is higher than 0.05'
        expr: >
          sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
        for: 5m
      - alert: IstiodContainerNotReady
        annotations:
          summary: 'istiod container not ready'
          description: 'container: discovery not running'
        expr: >
          kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0
        for: 5m
      - alert: IstiodUnavailableReplica
        annotations:
          summary: 'Istiod unavailable pod'
          description: 'Istiod unavailable replica > 0'
        expr: >
          kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0
        for: 5m
      # Rate of HTTP 200 responses through the ingress gateway below 1490 req/s.
      - alert: Ingress200RateLow
        annotations:
          summary: 'ingress gateway 200 rate drops'
          description: 'The expected rate is 100 per ns, the limit is set based on 15ns'
        expr: >
          sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490
        for: 30m
59 changes: 59 additions & 0 deletions lib/common/resources/amp-config/istio/recording-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Prometheus recording rules for Istio telemetry.
# Each rule sums one raw Istio metric while dropping the instance,
# kubernetes_namespace and kubernetes_pod_name labels, and stores the result
# under a "workload:"-prefixed series name.
groups:
  - name: "istio.recording-rules"
    interval: 5s
    rules:
      # HTTP/gRPC request counter.
      - record: "workload:istio_requests_total"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total)

      # Request duration histogram (count, sum, buckets).
      - record: "workload:istio_request_duration_milliseconds_count"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count)
      - record: "workload:istio_request_duration_milliseconds_sum"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum)
      - record: "workload:istio_request_duration_milliseconds_bucket"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket)

      # Request size histogram (count, sum, buckets).
      - record: "workload:istio_request_bytes_count"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count)
      - record: "workload:istio_request_bytes_sum"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum)
      - record: "workload:istio_request_bytes_bucket"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket)

      # Response size histogram (count, sum, buckets).
      - record: "workload:istio_response_bytes_count"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count)
      - record: "workload:istio_response_bytes_sum"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum)
      - record: "workload:istio_response_bytes_bucket"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket)

      # TCP byte and connection counters.
      - record: "workload:istio_tcp_sent_bytes_total"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total)
      - record: "workload:istio_tcp_received_bytes_total"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total)
      - record: "workload:istio_tcp_connections_opened_total"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total)
      - record: "workload:istio_tcp_connections_closed_total"
        expr: sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total)
35 changes: 35 additions & 0 deletions lib/single-new-eks-opensource-observability-pattern/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import { Construct } from 'constructs';
import { utils } from '@aws-quickstart/eks-blueprints';
import * as blueprints from '@aws-quickstart/eks-blueprints';
import { GrafanaOperatorSecretAddon } from './grafanaoperatorsecretaddon';
import * as eks from 'aws-cdk-lib/aws-eks';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as amp from 'aws-cdk-lib/aws-aps';
import { ObservabilityBuilder } from '@aws-quickstart/eks-blueprints';
import * as fs from 'fs';
Expand Down Expand Up @@ -97,6 +99,20 @@ export default class SingleNewEksOpenSourceobservabilityPattern {
);
}

// Opt-in Istio monitoring: only runs when the "istio.pattern.enabled" context
// value is truthy (the lookup falls back to false).
if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) {
// Replaces any collector configuration set earlier with this manifest and
// its parameter map.
ampAddOnProps.openTelemetryCollector = {
manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml',
manifestParameterMap: {
javaScrapeSampleLimit: 1000,
javaPrometheusMetricsEndpoint: "/metrics"
}
};
// Appends the Istio alerting and recording rule files to the AMP rule list.
// NOTE(review): because of the optional chain, nothing is appended when
// ampRules is undefined — confirm it is always initialised before this point.
ampAddOnProps.ampRules?.ruleFilePaths.push(
__dirname + '/../common/resources/amp-config/istio/alerting-rules.yml',
__dirname + '/../common/resources/amp-config/istio/recording-rules.yml'
);
}

Reflect.defineMetadata("ordered", true, blueprints.addons.GrafanaOperatorAddon);
const addOns: Array<blueprints.ClusterAddOn> = [
new blueprints.addons.CloudWatchLogsAddon({
Expand All @@ -108,9 +124,28 @@ export default class SingleNewEksOpenSourceobservabilityPattern {
new GrafanaOperatorSecretAddon()
];

if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) {
// Settings passed to the Istio control plane add-on; the chart version is pinned.
const istioControlPlaneAddOnProps = {
    version: "1.18.2",
};

Check failure on line 130 in lib/single-new-eks-opensource-observability-pattern/index.ts

View workflow job for this annotation

GitHub Actions / build (18)

Expected indentation of 12 spaces but found 14

Check failure on line 130 in lib/single-new-eks-opensource-observability-pattern/index.ts

View workflow job for this annotation

GitHub Actions / build (18)

Missing semicolon
// Queue the Istio base add-on first, then the control plane add-on.
// NOTE(review): the "1.18.2" literal here duplicates the version in
// istioControlPlaneAddOnProps — keep the two in sync when upgrading.
addOns.push(new blueprints.addons.IstioBaseAddOn({
version: "1.18.2"
}));
addOns.push(new blueprints.addons.IstioControlPlaneAddOn(istioControlPlaneAddOnProps));
}

// Managed node group settings handed to blueprints.MngClusterProvider in the
// builder chain below: Kubernetes 1.28 on m5.2xlarge AL2 x86_64 nodes.
const mngProps: blueprints.MngClusterProviderProps = {
version: eks.KubernetesVersion.of("1.28"),
instanceTypes: [new ec2.InstanceType("m5.2xlarge")],
amiType: eks.NodegroupAmiType.AL2_X86_64,
// Starts with 2 nodes and allows up to 3.
// NOTE(review): no minSize is given — presumably the provider default applies; confirm.
desiredSize: 2,
maxSize: 3,
};

ObservabilityBuilder.builder()
.account(account)
.region(region)
.clusterProvider(new blueprints.MngClusterProvider(mngProps))
.resourceProvider(ampWorkspaceName, new blueprints.CreateAmpProvider(ampWorkspaceName, ampWorkspaceName))
.version('auto')
.withAmpProps(ampAddOnProps)
Expand Down

0 comments on commit 30b06de

Please sign in to comment.