From 5dab2440c85b9db9dab42053c71067e1c0869f3f Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:04:25 +0200 Subject: [PATCH] Add obsctl-reloader row in the instance overview dashboard (#609) * Add obsctl-reloader row in the instance overview dash * Fix typo --- ...bs-instance-utilization-overview.libsonnet | 79 +++ ...stance-utilization-overview.configmap.yaml | 472 ++++++++++++++++++ 2 files changed, 551 insertions(+) diff --git a/observability/dashboards/rhobs-instance-utilization-overview.libsonnet b/observability/dashboards/rhobs-instance-utilization-overview.libsonnet index 017505f1ba..fe0cfc6bf4 100644 --- a/observability/dashboards/rhobs-instance-utilization-overview.libsonnet +++ b/observability/dashboards/rhobs-instance-utilization-overview.libsonnet @@ -120,6 +120,16 @@ function() { }, }, + obsctlReloader:: { + dashboard:: { + title: 'Observatorium - obsctl-reloader', + selector: std.join(', ', config.dashboard.selector + ['job=~"$job"']), + dimensions: std.join(', ', config.dashboard.dimensions + ['job']), + pod: 'rules-obsctl-reloader.*', + container: 'obsctl-reloader', + }, + }, + alerts:: { dashboard:: { selector: std.join(', ', ['service=~"observatorium.*|telemeter.*"']), @@ -738,6 +748,75 @@ function() { g.addDashboardLink(am.title) + { yaxes: g.yaxes('binBps') } ) { collapse: true } + ) + .addRow( + g.row('Obsctl Reloader Overview') + .addPanel( + g.panel('Rate of reloads', 'rate of rule reloads by obsctl-reloader') + + g.queryPanel( + [ + 'sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)', + ], + [ + 'reloads {{pod}}', + ] + ) { span:: 0 } + + g.addDashboardLink(thanos.obsctlReloader.dashboard.title) + ) + .addPanel( + g.panel('Percentage of reload errors', 'Percentage of rule reloads that failed') + + g.queryPanel( + [ + '100 * sum(rate(obsctl_reloader_prom_rule_set_failures_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job, reason) / ignoring (job, reason) group_left sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace) > 0', + ], + [ + 'reload error: {{reason}}', + ] + ) + { span:: 0 } + + { yaxes: g.yaxes('percent') } + + g.addDashboardLink(thanos.obsctlReloader.dashboard.title) + + g.stack + ) + .addPanel( + g.panel('Responses from Observatorium Rules API', 'Rate of responses from the Observatorium Rules API') + + g.queryPanel( + [ + 'sum(rate(obsctl_reloader_prom_rules_store_ops_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod,status_code) > 0', + ], + [ + '{{status_code}} - pod {{pod}}', + ] + ) { span:: 0 } + + g.addDashboardLink(thanos.obsctlReloader.dashboard.title) + + g.stack + ) + .addPanel( + g.panel('Rate of fetches', 'Rate of rule fetches via PrometheusRule CRs from the local cluster') + + g.queryPanel( + [ + 'sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)', + ], + [ + 'fetches {{pod}}', + ] + ) { span:: 0 } + + g.addDashboardLink(thanos.obsctlReloader.dashboard.title) + ) + .addPanel( + g.panel('Percentage of failed fetches', 'Percentage of failed rule fetches via PrometheusRule CRs from the local cluster') + + g.queryPanel( + [ + '100 * sum(rate(obsctl_reloader_prom_rule_fetch_failures_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod) / ignoring (job, pod) group_left sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace) > 0', + ], + [ + 'failed fetches {{pod}}', + ] + ) + + { span:: 0 } + + { yaxes: g.yaxes('percent') } + + g.addDashboardLink(thanos.obsctlReloader.dashboard.title) + ) { collapse: true } ) + { templating+: { list+: [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml index bdd7526ca7..e0ccb7630c 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml @@ -8230,6 +8230,478 @@ data: "showTitle": true, "title": "Alertmanager Overview", "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "rate of rule reloads by obsctl-reloader", + "fill": 1, + "id": 81, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Observatorium - obsctl-reloader", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - obsctl-reloader", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "reloads {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of reloads", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Percentage of rule reloads that failed", + "fill": 10, + "id": 82, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Observatorium - obsctl-reloader", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - obsctl-reloader", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "100 * sum(rate(obsctl_reloader_prom_rule_set_failures_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job, reason) / ignoring (job, reason) group_left sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "reload error: {{reason}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Percentage of reload errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of responses from the Observatorium Rules API", + "fill": 10, + "id": 83, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Observatorium - obsctl-reloader", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - obsctl-reloader", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(obsctl_reloader_prom_rules_store_ops_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod,status_code) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{status_code}} - pod {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Responses from Observatorium Rules API", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of rule fetches via PrometheusRule CRs from the local cluster", + "fill": 1, + "id": 84, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Observatorium - obsctl-reloader", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - obsctl-reloader", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "fetches {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of fetches", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Percentage of failed rule fetches via PrometheusRule CRs from the local cluster", + "fill": 1, + "id": 85, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Observatorium - obsctl-reloader", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - obsctl-reloader", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 * sum(rate(obsctl_reloader_prom_rule_fetch_failures_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod) / ignoring (job, pod) group_left sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "failed fetches {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Percentage of failed fetches", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Obsctl Reloader Overview", + "titleSize": "h6" } ], "schemaVersion": 14,