Skip to content

Commit

Permalink
Add obsctl-reloader row in the instance overview dashboard (#609)
Browse files Browse the repository at this point in the history
* Add obsctl-reloader row in the instance overview dash

* Fix typo
  • Loading branch information
douglascamata authored Sep 27, 2023
1 parent dfc913a commit 5dab244
Show file tree
Hide file tree
Showing 2 changed files with 551 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ function() {
},
},

// Settings for the obsctl-reloader dashboard. Both `obsctlReloader` and
// `dashboard` are declared with `::`, so they are hidden fields and are not
// emitted when this object is manifested.
obsctlReloader:: {
dashboard:: {
// Dashboard title.
// NOTE(review): presumably the value read as `thanos.obsctlReloader.dashboard.title`
// when building dashboard links — confirm that `thanos` resolves to this object.
title: 'Observatorium - obsctl-reloader',
// Shared selector entries from `config.dashboard.selector` plus a matcher on
// the `$job` variable, joined into one comma-separated string.
selector: std.join(', ', config.dashboard.selector + ['job=~"$job"']),
// Shared dimension entries from `config.dashboard.dimensions` plus `job`,
// joined into one comma-separated string.
dimensions: std.join(', ', config.dashboard.dimensions + ['job']),
// NOTE(review): these look like a pod-name regex and a container name for the
// obsctl-reloader workload; their consumers are not visible here — confirm.
pod: 'rules-obsctl-reloader.*',
container: 'obsctl-reloader',
},
},

alerts:: {
dashboard:: {
selector: std.join(', ', ['service=~"observatorium.*|telemeter.*"']),
Expand Down Expand Up @@ -738,6 +748,75 @@ function() {
g.addDashboardLink(am.title) +
{ yaxes: g.yaxes('binBps') }
) { collapse: true }
)
.addRow(
// Row 'Obsctl Reloader Overview': five panels built from obsctl-reloader metrics.
// Every query filters on the `$namespace` and `$job` dashboard variables and
// uses `$__rate_interval` as the rate window. The row is created with
// `collapse: true` (see the end of this expression).
// NOTE(review): each panel overrides `span` as a hidden field set to 0; the
// effect on layout depends on how the `g` row helper assigns spans — confirm.
g.row('Obsctl Reloader Overview')
// Panel 1: rate of obsctl_reloader_prom_rule_sets_total, summed per
// namespace/job/pod, with one legend entry per pod.
.addPanel(
g.panel('Rate of reloads', 'rate of rule reloads by obsctl-reloader') +
g.queryPanel(
[
'sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)',
],
[
'reloads {{pod}}',
]
) { span:: 0 } +
g.addDashboardLink(thanos.obsctlReloader.dashboard.title)
)
// Panel 2: failed rule sets as a percentage of all rule sets.
// The numerator is grouped by namespace/job/reason and the denominator by
// namespace only; `ignoring (job, reason) group_left` matches every numerator
// series to its namespace total. The trailing `> 0` drops results that are not
// positive, so only reasons with failures are drawn.
.addPanel(
g.panel('Percentage of reload errors', 'Percentage of rule reloads that failed') +
g.queryPanel(
[
'100 * sum(rate(obsctl_reloader_prom_rule_set_failures_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job, reason) / ignoring (job, reason) group_left sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace) > 0',
],
[
'reload error: {{reason}}',
]
)
{ span:: 0 } +
{ yaxes: g.yaxes('percent') } +
g.addDashboardLink(thanos.obsctlReloader.dashboard.title) +
// NOTE(review): `g.stack` presumably renders the series stacked — confirm
// against the `g` helper library.
g.stack
)
// Panel 3: rate of obsctl_reloader_prom_rules_store_ops_total, summed per
// namespace/job/pod/status_code; `> 0` drops series with no positive rate.
.addPanel(
g.panel('Responses from Observatorium Rules API', 'Rate of responses from the Observatorium Rules API') +
g.queryPanel(
[
'sum(rate(obsctl_reloader_prom_rules_store_ops_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod,status_code) > 0',
],
[
'{{status_code}} - pod {{pod}}',
]
) { span:: 0 } +
g.addDashboardLink(thanos.obsctlReloader.dashboard.title) +
g.stack
)
// Panel 4: rate of obsctl_reloader_prom_rule_fetches_total, summed per
// namespace/job/pod, with one legend entry per pod.
.addPanel(
g.panel('Rate of fetches', 'Rate of rule fetches via PrometheusRule CRs from the local cluster') +
g.queryPanel(
[
'sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)',
],
[
'fetches {{pod}}',
]
) { span:: 0 } +
g.addDashboardLink(thanos.obsctlReloader.dashboard.title)
)
// Panel 5: failed fetches as a percentage of all fetches.
// The numerator is grouped by namespace/job/pod and the denominator by
// namespace only; `ignoring (job, pod) group_left` matches every numerator
// series to its namespace total. `> 0` drops results that are not positive.
.addPanel(
g.panel('Percentage of failed fetches', 'Percentage of failed rule fetches via PrometheusRule CRs from the local cluster') +
g.queryPanel(
[
'100 * sum(rate(obsctl_reloader_prom_rule_fetch_failures_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod) / ignoring (job, pod) group_left sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~"$namespace",job=~"$job"}[$__rate_interval])) by (namespace) > 0',
],
[
'failed fetches {{pod}}',
]
) +
{ span:: 0 } +
{ yaxes: g.yaxes('percent') } +
g.addDashboardLink(thanos.obsctlReloader.dashboard.title)
) { collapse: true }
) + {
templating+: {
list+: [
Expand Down
Loading

0 comments on commit 5dab244

Please sign in to comment.