From 31ff388f30d8b86358f0385af38c0b4dd0916190 Mon Sep 17 00:00:00 2001
From: Stephen Lang
Date: Mon, 8 Apr 2024 17:17:26 +0100
Subject: [PATCH] fix(dashboards): Port multi-cluster dashboard to new grafonnet library

Signed-off-by: Stephen Lang
---
Review notes (this section is ignored by `git am`):

* multi-cluster.libsonnet: `statPanel(title, unit, query)` builds an instant
  Prometheus stat panel; `tsPanel.new(title)` builds a time series panel with
  a right-hand table legend. Both take their query interval from
  `$._config.grafanaK8s.minimumTimeInterval`.
* The "CPU Quota" and "Memory Requests by Cluster" tables join five instant
  queries on the cluster field, hide the per-query Time columns, rename the
  `Value #A..#E` columns, and attach a drill-down link to the Cluster column.
* The join/organize transformations now key on `$._config.clusterLabel`
  instead of the literal 'cluster'. Every query aggregates
  `by (%(clusterLabel)s)`, so with a non-default label the literal no longer
  matched the field that is actually returned (the removed code keyed its
  table style on `[$._config.clusterLabel]` as well).
* The "CPU Utilisation" stat now uses `percentunit`, like the other five
  headline ratios. NOTE(review): assumes `cluster:node_cpu:ratio_rate5m` is a
  0..1 ratio, as its name suggests - confirm against the recording rule.
* Table panels use the `table` local alias, which was declared but unused.
* config.libsonnet: the non-Windows dashboard IDs were 40-character strings
  and become 32-character md5 digests, so those UIDs change. NOTE(review):
  links that embed the old UIDs will presumably stop resolving - confirm this
  is acceptable.

 config.libsonnet                             |  46 +--
 dashboards/resources/multi-cluster.libsonnet | 391 ++++++++++++++-----
 2 files changed, 319 insertions(+), 118 deletions(-)

diff --git a/config.libsonnet b/config.libsonnet
index 41a94f144..43c4a9822 100644
--- a/config.libsonnet
+++ b/config.libsonnet
@@ -34,29 +34,29 @@
 
     // Grafana dashboard IDs are necessary for stable links for dashboards
     grafanaDashboardIDs: {
-      'k8s-resources-multicluster.json': '1gBgaexoVZ4TpBNAt2eGRsc4LNjNhdjcZd6cqU6S',
-      'k8s-resources-cluster.json': 'ZnbvYbcXkob7GLqcDPLTj1ZL4MRX87tOh8xdr831',
-      'k8s-resources-namespace.json': 'XaY4UCP3J51an4ikqtkUGBSjLpDW4pg39xe2FuxP',
-      'k8s-resources-pod.json': 'wU56sdGSNYZTL3eO0db3pONtVmTvsyV7w8aadbYF',
-      'k8s-multicluster-rsrc-use.json': 'NJ9AlnsObVgj9uKiJMeAqfzMi1wihOMupcsDhlhR',
-      'k8s-cluster-rsrc-use.json': 'uXQldxzqUNgIOUX6FyZNvqgP2vgYb78daNu4GiDc',
-      'k8s-node-rsrc-use.json': 'E577CMUOwmPsxVVqM9lj40czM1ZPjclw7hGa7OT7',
-      'nodes.json': 'kcb9C2QDe4IYcjiTOmYyfhsImuzxRcvwWC3YLJPS',
-      'persistentvolumesusage.json': 'AhCeikee0xoa6faec0Weep2nee6shaiquigahw8b',
-      'pods.json': 'AMK9hS0rSbSz7cKjPHcOtk6CGHFjhSHwhbQ3sedK',
-      'statefulset.json': 'dPiBt0FRG5BNYo0XJ4L0Meoc7DWs9eL40c1CRc1g',
-      'k8s-resources-windows-cluster.json': '4d08557fd9391b100730f2494bccac68',
-      'k8s-resources-windows-namespace.json': '490b402361724ab1d4c45666c1fa9b6f',
-      'k8s-resources-windows-pod.json': '40597a704a610e936dc6ed374a7ce023',
-      'k8s-windows-cluster-rsrc-use.json': '53a43377ec9aaf2ff64dfc7a1f539334',
-      'k8s-windows-node-rsrc-use.json': '96e7484b0bb53b74fbc2bcb7723cd40b',
-      'k8s-resources-workloads-namespace.json': 'L29WgMrccBDauPs3Xsti3fwaKjMB6fReufWj6Gl1',
-      'k8s-resources-workload.json': 'hZCNbUPfUqjc95N3iumVsaEVHXzaBr3IFKRFvUJf',
-      'apiserver.json': 'eswbt59QCroA3XLdKFvdOHlKB8Iks3h7d2ohstxr',
-      'controller-manager.json': '5g73oHG0pCRz4X1t6gNYouVUv9urrQd4wCdHR2mI',
-      'scheduler.json': '4uMPZ9jmwvYJcM5fcNcNrrt9Sf6ufQL4IKFri2Gp',
-      'proxy.json': 'hhT4orXD1Ott4U1bNNps0R26EHTwMypdcaCjDRPM',
-      'kubelet.json': 'B1azll2ETo7DTiM8CysrH6g4s5NCgkOz6ZdU8Q0j',
+      'k8s-resources-multicluster.json': std.md5('k8s-resources-multicluster.json'),
+      'k8s-resources-cluster.json': std.md5('k8s-resources-cluster.json'),
+      'k8s-resources-namespace.json': std.md5('k8s-resources-namespace.json'),
+      'k8s-resources-pod.json': std.md5('k8s-resources-pod.json'),
+      'k8s-multicluster-rsrc-use.json': std.md5('k8s-multicluster-rsrc-use.json'),
+      'k8s-cluster-rsrc-use.json': std.md5('k8s-cluster-rsrc-use.json'),
+      'k8s-node-rsrc-use.json': std.md5('k8s-node-rsrc-use.json'),
+      'nodes.json': std.md5('nodes.json'),
+      'persistentvolumesusage.json': std.md5('persistentvolumesusage.json'),
+      'pods.json': std.md5('pods.json'),
+      'statefulset.json': std.md5('statefulset.json'),
+      'k8s-resources-windows-cluster.json': std.md5('k8s-resources-windows-cluster.json'),
+      'k8s-resources-windows-namespace.json': std.md5('k8s-resources-windows-namespace.json'),
+      'k8s-resources-windows-pod.json': std.md5('k8s-resources-windows-pod.json'),
+      'k8s-windows-cluster-rsrc-use.json': std.md5('k8s-windows-cluster-rsrc-use.json'),
+      'k8s-windows-node-rsrc-use.json': std.md5('k8s-windows-node-rsrc-use.json'),
+      'k8s-resources-workloads-namespace.json': std.md5('k8s-resources-workloads-namespace.json'),
+      'k8s-resources-workload.json': std.md5('k8s-resources-workload.json'),
+      'apiserver.json': std.md5('apiserver.json'),
+      'controller-manager.json': std.md5('controller-manager.json'),
+      'scheduler.json': std.md5('scheduler.json'),
+      'proxy.json': std.md5('proxy.json'),
+      'kubelet.json': std.md5('kubelet.json'),
     },
 
     // Support for Grafana 7.2+ `$__rate_interval` instead of `$__interval`
diff --git a/dashboards/resources/multi-cluster.libsonnet b/dashboards/resources/multi-cluster.libsonnet
index b13276c3c..4ff4c5afe 100644
--- a/dashboards/resources/multi-cluster.libsonnet
+++ b/dashboards/resources/multi-cluster.libsonnet
@@ -1,107 +1,308 @@
-local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+local prometheus = g.query.prometheus;
+local stat = g.panel.stat;
+local table = g.panel.table;
+local timeSeries = g.panel.timeSeries;
+local var = g.dashboard.variable;
 
 {
+  local statPanel(title, unit, query) =
+    stat.new(title)
+    + stat.options.withColorMode('none')
+    + stat.standardOptions.withUnit(unit)
+    + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
+    + stat.queryOptions.withTargets([
+      prometheus.new('${datasource}', query)
+      + prometheus.withInstant(true),
+    ]),
+
+  local tsPanel =
+    timeSeries {
+      new(title):
+        timeSeries.new(title)
+        + timeSeries.options.legend.withShowLegend()
+        + timeSeries.options.legend.withAsTable()
+        + timeSeries.options.legend.withDisplayMode('table')
+        + timeSeries.options.legend.withPlacement('right')
+        + timeSeries.options.tooltip.withMode('single')
+        + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+        + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
+    },
+
   grafanaDashboards+:: if $._config.showMultiCluster then {
     'k8s-resources-multicluster.json':
-      local tableStyles = {
-        [$._config.clusterLabel]: {
-          alias: 'Cluster',
-          link: '%(prefix)s/d/%(uid)s/k8s-resources-cluster?var-datasource=$datasource&var-cluster=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-cluster.json') },
+      local variables = {
+        datasource:
+          var.datasource.new('datasource', 'prometheus')
+          + var.datasource.withRegex($._config.datasourceFilterRegex)
+          + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
+          + var.datasource.generalOptions.withLabel('Data source')
+          + {
+            current: {
+              selected: true,
+              text: $._config.datasourceName,
+              value: $._config.datasourceName,
+            },
+          },
+      };
+
+      local links = {
+        cluster: {
+          title: 'Drill down',
+          url: '%(prefix)s/d/%(uid)s/kubernetes-compute-resources-cluster?${datasource:queryparam}&var-cluster=${__data.fields.Cluster}' % {
+            uid: $._config.grafanaDashboardIDs['k8s-resources-cluster.json'],
+            prefix: $._config.grafanaK8s.linkPrefix,
+          },
         },
       };
 
-      g.dashboard(
-        '%(dashboardNamePrefix)sCompute Resources / Multi-Cluster' % $._config.grafanaK8s,
-        uid=($._config.grafanaDashboardIDs['k8s-resources-multicluster.json']),
-        datasource_regex=$._config.datasourceFilterRegex,
-        datasource=$._config.datasourceName,
-      ).addRow(
-        (g.row('Headlines') +
-         {
-           height: '100px',
-           showTitle: false,
-         })
-        .addPanel(
-          g.panel('CPU Utilisation') +
-          g.statPanel('cluster:node_cpu:ratio_rate5m')
-        )
-        .addPanel(
-          g.panel('CPU Requests Commitment') +
-          g.statPanel('sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="cpu"})' % $._config)
-        )
-        .addPanel(
-          g.panel('CPU Limits Commitment') +
-          g.statPanel('sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="cpu"})' % $._config)
-        )
-        .addPanel(
-          g.panel('Memory Utilisation') +
-          g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum) / sum(node_memory_MemTotal_bytes{%(nodeExporterSelector)s})' % $._config)
-        )
-        .addPanel(
-          g.panel('Memory Requests Commitment') +
-          g.statPanel('sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="memory"})' % $._config)
-        )
-        .addPanel(
-          g.panel('Memory Limits Commitment') +
-          g.statPanel('sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="memory"})' % $._config)
-        )
-      )
-      .addRow(
-        g.row('CPU')
-        .addPanel(
-          g.panel('CPU Usage') +
-          g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config)
-          + { fill: 0, linewidth: 2 },
-        )
-      )
-      .addRow(
-        g.row('CPU Quota')
-        .addPanel(
-          g.panel('CPU Quota') +
-          g.tablePanel([
-            'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config,
-            'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config,
-            'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config,
-            'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config,
-            'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config,
-          ], tableStyles {
-            'Value #A': { alias: 'CPU Usage' },
-            'Value #B': { alias: 'CPU Requests' },
-            'Value #C': { alias: 'CPU Requests %', unit: 'percentunit' },
-            'Value #D': { alias: 'CPU Limits' },
-            'Value #E': { alias: 'CPU Limits %', unit: 'percentunit' },
-          })
-        )
-      )
-      .addRow(
-        g.row('Memory')
-        .addPanel(
-          g.panel('Memory Usage (w/o cache)') +
-          // Not using container_memory_usage_bytes here because that includes page cache
-          g.queryPanel('sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config) +
-          { fill: 0, linewidth: 2, yaxes: g.yaxes('bytes') },
-        )
-      )
-      .addRow(
-        g.row('Memory Requests')
-        .addPanel(
-          g.panel('Requests by Cluster') +
-          g.tablePanel([
+      local panels = {
+        highlights: [
+          statPanel(
+            'CPU Utilisation',
+            'percentunit',
+            'cluster:node_cpu:ratio_rate5m'
+          ),
+
+          statPanel(
+            'CPU Requests Commitment',
+            'percentunit',
+            'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="cpu"})' % $._config
+          ),
+
+          statPanel(
+            'CPU Limits Commitment',
+            'percentunit',
+            'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="cpu"})' % $._config
+          ),
+
+          statPanel(
+            'Memory Utilisation',
+            'percentunit',
+            '1 - sum(:node_memory_MemAvailable_bytes:sum) / sum(node_memory_MemTotal_bytes{%(nodeExporterSelector)s})' % $._config
+          ),
+
+          statPanel(
+            'Memory Requests Commitment',
+            'percentunit',
+            'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="memory"})' % $._config
+          ),
+
+          statPanel(
+            'Memory Limits Commitment',
+            'percentunit',
+            'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s, resource="memory"})' % $._config
+          ),
+        ],
+
+        cpuUsage: [
+          tsPanel.new('CPU Usage')
+          + tsPanel.queryOptions.withTargets([
+            prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withLegendFormat('__auto'),
+          ]),
+        ],
+
+        cpuQuota: [
+          table.new('CPU Quota')
+          + table.queryOptions.withTargets([
+            prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="cpu"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+          ])
+          + table.queryOptions.withTransformations([
+            table.queryOptions.transformation.withId('joinByField')
+            + table.queryOptions.transformation.withOptions({
+              byField: $._config.clusterLabel,
+              mode: 'outer',
+            }),
+
+            table.queryOptions.transformation.withId('organize')
+            + table.queryOptions.transformation.withOptions({
+              excludeByName: {
+                'Time 1': true,
+                'Time 2': true,
+                'Time 3': true,
+                'Time 4': true,
+                'Time 5': true,
+              },
+              indexByName: {
+                'Time 1': 0,
+                'Time 2': 1,
+                'Time 3': 2,
+                'Time 4': 3,
+                'Time 5': 4,
+                [$._config.clusterLabel]: 5,
+                'Value #A': 6,
+                'Value #B': 7,
+                'Value #C': 8,
+                'Value #D': 9,
+                'Value #E': 10,
+              },
+              renameByName: {
+                [$._config.clusterLabel]: 'Cluster',
+                'Value #A': 'CPU Usage',
+                'Value #B': 'CPU Requests',
+                'Value #C': 'CPU Requests %',
+                'Value #D': 'CPU Limits',
+                'Value #E': 'CPU Limits %',
+              },
+            }),
+          ])
+
+          + table.standardOptions.withOverrides([
+            {
+              matcher: {
+                id: 'byRegexp',
+                options: '/%/',
+              },
+              properties: [
+                {
+                  id: 'unit',
+                  value: 'percentunit',
+                },
+              ],
+            },
+            {
+              matcher: {
+                id: 'byName',
+                options: 'Cluster',
+              },
+              properties: [
+                {
+                  id: 'links',
+                  value: [links.cluster],
+                },
+              ],
+            },
+          ]),
+        ],
+
+        memoryUsage: [
+          tsPanel.new('Memory Usage (w/o cache)')
+          + tsPanel.standardOptions.withUnit('bytes')
+          + tsPanel.queryOptions.withTargets([
             // Not using container_memory_usage_bytes here because that includes page cache
-            'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s)' % $._config,
-            'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config,
-            'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config,
-            'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config,
-            'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config,
-          ], tableStyles {
-            'Value #A': { alias: 'Memory Usage', unit: 'bytes' },
-            'Value #B': { alias: 'Memory Requests', unit: 'bytes' },
-            'Value #C': { alias: 'Memory Requests %', unit: 'percentunit' },
-            'Value #D': { alias: 'Memory Limits', unit: 'bytes' },
-            'Value #E': { alias: 'Memory Limits %', unit: 'percentunit' },
-          })
-        )
+            prometheus.new('${datasource}', 'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withLegendFormat('__auto'),
+          ]),
+        ],
+
+        memoryRequests: [
+          table.new('Memory Requests by Cluster')
+          + table.standardOptions.withUnit('bytes')
+          + table.queryOptions.withTargets([
+            prometheus.new('${datasource}', 'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+            prometheus.new('${datasource}', 'sum(container_memory_rss{%(cadvisorSelector)s, container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits{%(kubeStateMetricsSelector)s, resource="memory"}) by (%(clusterLabel)s)' % $._config)
+            + prometheus.withInstant(true)
+            + prometheus.withFormat('table'),
+          ])
+          + table.queryOptions.withTransformations([
+            table.queryOptions.transformation.withId('joinByField')
+            + table.queryOptions.transformation.withOptions({
+              byField: $._config.clusterLabel,
+              mode: 'outer',
+            }),
+
+            table.queryOptions.transformation.withId('organize')
+            + table.queryOptions.transformation.withOptions({
+              excludeByName: {
+                'Time 1': true,
+                'Time 2': true,
+                'Time 3': true,
+                'Time 4': true,
+                'Time 5': true,
+              },
+              indexByName: {
+                'Time 1': 0,
+                'Time 2': 1,
+                'Time 3': 2,
+                'Time 4': 3,
+                'Time 5': 4,
+                [$._config.clusterLabel]: 5,
+                'Value #A': 6,
+                'Value #B': 7,
+                'Value #C': 8,
+                'Value #D': 9,
+                'Value #E': 10,
+              },
+              renameByName: {
+                [$._config.clusterLabel]: 'Cluster',
+                'Value #A': 'Memory Usage',
+                'Value #B': 'Memory Requests',
+                'Value #C': 'Memory Requests %',
+                'Value #D': 'Memory Limits',
+                'Value #E': 'Memory Limits %',
+              },
+            }),
+          ])
+
+          + table.standardOptions.withOverrides([
+            {
+              matcher: {
+                id: 'byRegexp',
+                options: '/%/',
+              },
+              properties: [
+                {
+                  id: 'unit',
+                  value: 'percentunit',
+                },
+              ],
+            },
+            {
+              matcher: {
+                id: 'byName',
+                options: 'Cluster',
+              },
+              properties: [
+                {
+                  id: 'links',
+                  value: [links.cluster],
+                },
+              ],
+            },
+          ]),
+        ],
+      };
+
+      g.dashboard.new('%(dashboardNamePrefix)sCompute Resources / Multi-Cluster' % $._config.grafanaK8s)
+      + g.dashboard.withUid($._config.grafanaDashboardIDs['k8s-resources-multicluster.json'])
+      + g.dashboard.withEditable(false)
+      + g.dashboard.time.withFrom('now-1h')
+      + g.dashboard.time.withTo('now')
+      + g.dashboard.withVariables([variables.datasource])
+      + g.dashboard.withPanels(
+        g.util.grid.wrapPanels(panels.highlights, panelWidth=4, panelHeight=3, startY=0)
+        + g.util.grid.wrapPanels(panels.cpuUsage, panelWidth=24, panelHeight=7, startY=1)
+        + g.util.grid.wrapPanels(panels.cpuQuota, panelWidth=24, panelHeight=7, startY=2)
+        + g.util.grid.wrapPanels(panels.memoryUsage, panelWidth=24, panelHeight=7, startY=3)
+        + g.util.grid.wrapPanels(panels.memoryRequests, panelWidth=24, panelHeight=7, startY=4)
       ),
   } else {},
 }