diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 66da677b55..362aa40d97 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana" } }, - "version": "d039275e4916aceae1c137120882e01d857787ac", - "sum": "515vMn4x4tP8vegL4HLW0nDO5+njGTgnDZB5OOhtsCI=" + "version": "5698c8940b6dadca3f42107b7839557bc041761f", + "sum": "l6fPvh3tW6fWot308w71QY/amrYsFPeitvz1IgJxqQA=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "contrib/mixin" } }, - "version": "08f4c340eb92338d0f266795222d9ea04cb69056", - "sum": "zhLYhUNcXNkMRfJhMUX0UiOpi8TOuLmUqJfO9NFKFkg=" + "version": "19aa0dbe8fd6317a237bae9b6ea52a4f1b445b19", + "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws=" }, { "source": { @@ -38,8 +38,8 @@ "subdir": "grafonnet" } }, - "version": "daad85cf3fad3580e58029414630e29956aefe21", - "sum": "zkOBVXtNSGlOdbm5TRCbEik7c/Jk+btbJqaE9qW8j3Y=" + "version": "a1d61cce1da59c71409b99b5c7568511fec661ea", + "sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc=" }, { "source": { @@ -48,9 +48,39 @@ "subdir": "grafonnet-7.0" } }, - "version": "6db00c292d3a1c71661fc875f90e0ec7caa538c2", + "version": "a1d61cce1da59c71409b99b5c7568511fec661ea", "sum": "gCtR9s/4D5fxU9aKXg0Bru+/njZhA0YjLjPiASc61FM=" }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864", + "sum": "64fMUPI3frXGj4X1FqFd1t7r04w3CUSmXaDcJ23EYbQ=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864", + "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.1.0" + } + }, + "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864", + "sum": "41w7p/rwrNsITqNHMXtGSJAfAyKmnflg6rFhKBduUxM=" + }, { "source": { "git": { @@ -58,7 +88,7 @@ "subdir": "consul" } }, - "version": "18a7aa2cd2154057a16d4667f6b1debda8bc50a6", + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", "sum": "Po3c1Ic96ngrJCtOazic/7OsLkoILOKZWXWyZWl+od8=" }, { @@ -68,8 +98,8 @@ "subdir": "grafana-builder" } }, - "version": "4d4b5b1ce01003547a110f93cc86b8b7afb282a6", - "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", + "sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo=" }, { "source": { @@ -78,8 +108,8 @@ "subdir": "jaeger-agent-mixin" } }, - "version": "3b08e7d37511dfd39af6027d07788a5ca8ec71b1", - "sum": "nsukyr2SS8h97I2mxvBazXZp2fxu1i6eg+rKq3/NRwY=" + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", + "sum": "h/nMdmRK8Ep4bnRXKOuVL+RlUm85hhtD0LGIsf2THAY=" }, { "source": { @@ -88,7 +118,7 @@ "subdir": "ksonnet-util" } }, - "version": "18a7aa2cd2154057a16d4667f6b1debda8bc50a6", + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", "sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0=" }, { @@ -98,8 +128,8 @@ "subdir": "memcached" } }, - "version": "18a7aa2cd2154057a16d4667f6b1debda8bc50a6", - "sum": "SWywAq4U0MRPMbASU0Ez8O9ArRNeoZzb75sEuReueow=" + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", + "sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8=" }, { "source": { @@ -108,8 +138,8 @@ "subdir": "memcached-mixin" } }, - "version": "de586e2ac76e9bcee87d34f0042abe1a2ef7cdf3", - "sum": "f9TjLnCiqj5BQ4QiXhrAh8lRrKotAgq3EZU5Zu2zFl4=" + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", + "sum": "nhwKgMyvN5NOoRk+YrSv+HNPrZQmzo4O35zfpep3Jcw=" }, { "source": { @@ -118,8 +148,8 @@ "subdir": "mixin-utils" } }, - "version": "881db2241f0c5007c3e831caf34b0c645202b4ab", - "sum": "Je2SxBKu+1WrKEEG60zjSKaY/6TPX8uRz5bsaw0a8oA=" + "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5", + "sum": "LoYq5QxJmUXEtqkEG8CFUBLBhhzDDaNANHc7Gz36ZdM=" }, { "source": { @@ -148,8 +178,8 @@ "subdir": "operations/mimir-mixin" } }, - "version": "0421daaa5ffd91c601a130331e1124a4b615695d", - "sum": "vvTsA5XdK4oBTCcin7hbI1eGpeEn+vSH9KZ0ZsBdHxE=" + "version": "72cbd836389c935ad79419f8124de73796bbb42a", + "sum": "XCjsn/YmHaPHBrJv7q62aVQ3gvRXiQLGNgaa6p+Mitk=" }, { "source": { @@ -158,8 +188,8 @@ "subdir": "doc-util" } }, - "version": "2eae33a828320269c42acf38e808479a33e416db", - "sum": "lppHbNARpG3YTpuSv94X9TyIE9TfV3CyTVceIHSRxpc=" + "version": "6ac6c69685b8c29c54515448eaca583da2d88150", + "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" }, { "source": { @@ -168,8 +198,18 @@ "subdir": "1.26" } }, - "version": "9e5b48eee32913938d3cac30f183b49ecd9fe13a", - "sum": "7pl3HQqiKg4zJ0dWFqMo9yMGDEvlVdxgPGr1rMm0/LE=" + "version": "6ecbb7709baf27f44b2e48f3529741ae6754ae6a", + "sum": "fLShxv/gGx8+Jay43Kb0hjvGXCrwJ+2XJ1Hoir2yBZM=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c", + "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE=" }, { "source": { @@ -189,8 +229,8 @@ "subdir": "" } }, - "version": "3422a511bf8f645e3e684632785b27864ee5dc0c", - "sum": "tpgokDM1s/6CL4p+tlq3Nu54r62/kPfGnLUKRgYIC4k=" + "version": "bdbf7f45cedf37d07567be7519fa4139043f9335", + "sum": "j4EAKfqkbPvBFGnBjt4hex2bdNHPpuFWrCxfq5L6EkU=" }, { "source": { @@ -199,8 +239,8 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "7e28ed2722a8edfade41238ee3f0e0117ff041b6", - "sum": "P0dCnbzyPScQGNXwXRcwiPkMLeTq0IPNbSTysDbySnM=" + "version": "17151aca659e0659259b5e1f5675acf849281ade", + "sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4=" }, { "source": { @@ -209,8 +249,8 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "7e28ed2722a8edfade41238ee3f0e0117ff041b6", - "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" + "version": "17151aca659e0659259b5e1f5675acf849281ade", + "sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c=" }, { "source": { @@ -240,7 +280,7 @@ "subdir": "jsonnet/lib" } }, - "version": "7afc79f5db2d952e41462afe98c40e7cd13e2362", + "version": "d8c38f7d004e9fecc5a9fee3b77f4f1bbcf53a21", "sum": "HBkOxudsp1b5XmmCYGu9Yuv+UGFLimHi+fttA/rVTP4=", "name": "rules-objstore" }, @@ -251,8 +291,8 @@ "subdir": "jsonnet/lib" } }, - "version": "02aec09ce44b2f26ec9364469c2c6396f58702eb", - "sum": "kDkLgDFudmDJhEYpMjInoy5Kzz+OedfMNFGpQ9fteG4=", + "version": "7140e9476289b57b815692c3ec2dfd95b5fb4b6b", + "sum": "KG4fosMr9R2szMfkrIadwZts3OqHg7E4aQqzMdg+og8=", "name": "thanos-receive-controller" }, { @@ -262,7 +302,7 @@ "subdir": "jsonnet/thanos-receive-controller-mixin" } }, - "version": "89ba95dea87092d01777e77b5a636ef497b32e86", + "version": "7140e9476289b57b815692c3ec2dfd95b5fb4b6b", "sum": "SvBm5veTRA3P8YicTBC0XHjGZ8x867h+4bS6PKWB+Zc=" }, { @@ -272,8 +312,8 @@ "subdir": "jsonnet/lib" } }, - "version": "fb2d4139e10a4baaf4f7250fcc9b1ceb0aa01b0a", - "sum": "c+ywXdmRYJpWnCzAh85n4AWMcxj+sqx5rNHb3lzYuEY=" + "version": "f5e3403646c40808283ef34126e22f564a9ef2b0", + "sum": "zVUwcryZZp6av50NB4U2vZIoIx6siiEyWZxSevcnU+g=" }, { "source": { @@ -282,7 +322,7 @@ "subdir": "jsonnet" } }, - "version": "ec9ecc69c91cce9d6f6d7ce4c0054424c697a8a4", + "version": "e1e1857b0b6e796268005f2cc8f73e760b7dbf34", "sum": "0FKabnXd0rMeu8YpkkopEOknqBf5PLq/DIIDd0ve7cU=", "name": "up" }, @@ -293,8 +333,8 @@ "subdir": "jsonnet/telemeter" } }, - "version": "0453f452b7ca677e3be10faa95bea714733b4830", - "sum": "0m1kvO0SH4YoldGek69nhBGMTPxYz3gYcyNhfrymAOE=" + "version": "700802fe29aba4ed602309f0c68222c4bedb08f5", + "sum": "53k2pqb4mjABCulTT/H5d7pQDiemIOO23gzfon7g9K4=" }, { "source": { @@ -303,8 +343,8 @@ "subdir": "jsonnet/kube-prometheus" } }, - "version": "1c19d2a26167927459dec3778dfd21050052382f", - "sum": "FwTPXq9GLUUF49qFDvL9h7kISvKx+dxxju+M1RjWMP8=" + "version": "c503e5cc5403dd5d56b1c0c5933827baee64aeaf", + "sum": "fJqINQiYJPmllXFFO+Hl5HrPYANMbhHFUQ28tl0Vi00=" }, { "source": { @@ -313,7 +353,7 @@ "subdir": "jsonnet/kube-prometheus/lib" } }, - "version": "f737ac4d012c05b0e1f187f247295923ba9560b6", + "version": "c503e5cc5403dd5d56b1c0c5933827baee64aeaf", "sum": "QKRgrgEZ3k9nLmLCrDBaeIGVqQZf+AvZTcnhdLk3TrA=" }, { @@ -323,8 +363,8 @@ "subdir": "jsonnet/mixin" } }, - "version": "b3e71e5e9942b9fd1cda58c295ee34c2be0bc71c", - "sum": "GQmaVFJwKMiD/P4n3N2LrAZVcwutriWrP8joclDtBYQ=", + "version": "e951bd3037a053fea681510ccde211c28dc657e1", + "sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=", "name": "prometheus-operator-mixin" }, { @@ -334,8 +374,8 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "b3e71e5e9942b9fd1cda58c295ee34c2be0bc71c", - "sum": "n0Fb2DTfaxGkwXgN7cy4NaVgLt+5+ROfiOWGryxUDUM=" + "version": "e951bd3037a053fea681510ccde211c28dc657e1", + "sum": "YOJjmladGD1PcgNae0h88Mm235CsZSfwf2a4DIcMJFU=" }, { "source": { @@ -344,8 +384,8 @@ "subdir": "doc/alertmanager-mixin" } }, - "version": "f958b8be84b870e363f7dafcbeb807b463269a75", - "sum": "f3iZDUXQ/YWB5yDCY7VLD5bs442+3CdJgXJhJyWhNf8=" + "version": "f6b942cf9b3a503d59192eada300d2ad97cba82f", + "sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k=" }, { "source": { @@ -354,8 +394,8 @@ "subdir": "docs/node-mixin" } }, - "version": "c6e1a5b74277557045f0620e6d37259a291cb03b", - "sum": "+ZeoFzdjV7GKrrs0Bf6a+M+QDikd5QhcxTnFRObA0/w=" + "version": "49d177bf95417b117ab612a376e2434d5dd61c2d", + "sum": "cQCW+1N0Xae5yXecCWDK2oAlN0luBS/5GrwBYSlaFms=" }, { "source": { @@ -364,19 +404,20 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "1dd247f68bb1612469e703f1c363fe70597b0be2", - "sum": "APXOIP3B3dZ3Tyh7L2UhyWR8Vbf5+9adTLz/ya7n6uU=", + "version": "789c9b1a5e455850ed9b3c89cafb37df75ce1e50", + "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI=", "name": "prometheus" }, { "source": { "git": { "remote": "https://github.com/pyrra-dev/pyrra.git", - "subdir": "config/crd/bases" + "subdir": "jsonnet/controller-gen" } }, - "version": "3a58b4ed4f649c9502b8dc3e6bfc5b4159e33b48", - "sum": "GQ0GFKGdIWKx1b78VRs6jtC4SMqkBjT5jl65QUjPKK4=" + "version": "d723f4d1a066dd657e9d09c46a158519dda0faa8", + "sum": "cxAPQovFkM16zNB5/94O+sk/n3SETk6ao6Oas2Sa6RE=", + "name": "pyrra" }, { "source": { @@ -385,7 +426,7 @@ "subdir": "jsonnet/lib" } }, - "version": "969b895fe8d20cfc03b3e730c4625727752578a9", + "version": "49eea20aad078ef71283151bab24fc98582f02b8", "sum": "X4ruN0pkmokEkDCWYSai925MFpueJsgArTaHo93N6MU=" }, { @@ -395,8 +436,8 @@ "subdir": "jsonnet/kube-thanos" } }, - "version": "ac261330bb819523d2caba81b4e82add166436c7", - "sum": "sN9PqW93Kh+jyf5kvRKQgotY3xbRIU/h22TcNO2KTag=" + "version": "a28d8ac336e4d7ca4309237fa79d4116e5364d7e", + "sum": "6WL0iZD2b4OkWVni3ppQ4n5TP3SsJFBYddacf2YU4gk=" }, { "source": { @@ -405,8 +446,8 @@ "subdir": "mixin" } }, - "version": "5d695e9226e4360c450a2dbf3076f79835829dd1", - "sum": "WhheqsiX0maUXByZFsb9xhCEsGXK2955bPmPPf1x+Cs=", + "version": "f9da21ec0b28073875520159fe72ab744c255b2e", + "sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=", "name": "thanos-mixin" } ], diff --git a/resources/observability/grafana/observatorium-logs/grafana-dashboards-template.yaml b/resources/observability/grafana/observatorium-logs/grafana-dashboards-template.yaml index 9a3cc623cc..2f8434d01d 100644 --- a/resources/observability/grafana/observatorium-logs/grafana-dashboards-template.yaml +++ b/resources/observability/grafana/observatorium-logs/grafana-dashboards-template.yaml @@ -2152,10 +2152,8 @@ objects: { "expr": "sum(loki_ingester_memory_chunks{job=\"observatorium-loki-ingester\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "series", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2165,8 +2163,8 @@ objects: "timeShift": null, "title": "Series", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2238,10 +2236,8 @@ objects: { "expr": "sum(loki_ingester_memory_chunks{job=\"observatorium-loki-ingester\"}) / sum(loki_ingester_memory_streams{job=\"observatorium-loki-ingester\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "chunks", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2251,8 +2247,8 @@ objects: "timeShift": null, "title": "Chunks per series", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2336,26 +2332,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_utilization_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_utilization_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_ingester_chunk_utilization_sum{job=\"observatorium-loki-ingester\"}[$__rate_interval])) * 1 / sum(rate(loki_ingester_chunk_utilization_count{job=\"observatorium-loki-ingester\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -2365,8 +2355,8 @@ objects: "timeShift": null, "title": "Utilization", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2438,26 +2428,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_age_seconds_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_age_seconds_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_ingester_chunk_age_seconds_sum{job=\"observatorium-loki-ingester\"}[$__rate_interval])) * 1e3 / sum(rate(loki_ingester_chunk_age_seconds_count{job=\"observatorium-loki-ingester\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -2467,8 +2451,8 @@ objects: "timeShift": null, "title": "Age", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2552,26 +2536,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_entries_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_entries_bucket{job=\"observatorium-loki-ingester\"}[$__rate_interval])) by (le)) * 1", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_ingester_chunk_entries_sum{job=\"observatorium-loki-ingester\"}[$__rate_interval])) * 1 / sum(rate(loki_ingester_chunk_entries_count{job=\"observatorium-loki-ingester\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -2581,8 +2559,8 @@ objects: "timeShift": null, "title": "Log Entries Per Chunk", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2654,10 +2632,8 @@ objects: { "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{job=\"observatorium-loki-ingester\"}[5m])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{job=\"observatorium-loki-ingester\"}[5m]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Index Entries", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2667,8 +2643,8 @@ objects: "timeShift": null, "title": "Index Entries Per Chunk", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2752,10 +2728,8 @@ objects: { "expr": "cortex_ingester_flush_queue_length{job=\"observatorium-loki-ingester\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2765,8 +2739,8 @@ objects: "timeShift": null, "title": "Queue Length", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2805,6 +2779,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -2842,12 +2818,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_ingester_chunk_age_seconds_count{job=\"observatorium-loki-ingester\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_ingester_chunk_age_seconds_count{job=\"observatorium-loki-ingester\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -2857,8 +2831,8 @@ objects: "timeShift": null, "title": "Flush Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2942,10 +2916,8 @@ objects: { "expr": "sum(rate(loki_ingester_chunks_flushed_total{job=\"observatorium-loki-ingester\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2955,8 +2927,8 @@ objects: "timeShift": null, "title": "Chunks Flushed/Second", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3028,10 +3000,8 @@ objects: { "expr": "sum by (reason) (rate(loki_ingester_chunks_flushed_total{job=\"observatorium-loki-ingester\"}[$__rate_interval])) / ignoring(reason) group_left sum(rate(loki_ingester_chunks_flushed_total{job=\"observatorium-loki-ingester\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{reason}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3041,8 +3011,8 @@ objects: "timeShift": null, "title": "Chunk Flush Reason", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3254,26 +3224,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{job=\"observatorium-loki-ingester\"}[1m])) by (le))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p99", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{job=\"observatorium-loki-ingester\"}[1m])) by (le))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p90", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{job=\"observatorium-loki-ingester\"}[1m])) by (le))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p50", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3283,8 +3247,8 @@ objects: "timeShift": null, "title": "Chunk Size Quantiles", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3368,26 +3332,20 @@ objects: { "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{job=\"observatorium-loki-ingester\"}[5m])) by (le))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p50", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{job=\"observatorium-loki-ingester\"}[5m])) by (le))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p99", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{job=\"observatorium-loki-ingester\"}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{job=\"observatorium-loki-ingester\"}[5m]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "avg", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3397,8 +3355,8 @@ objects: "timeShift": null, "title": "Chunk Duration hours (end-start)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3455,7 +3413,7 @@ objects: "value": "${OBSERVATORIUM_API_DATASOURCE}" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -8750,7 +8708,7 @@ objects: "value": "${OBSERVATORIUM_API_DATASOURCE}" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -10891,6 +10849,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -10928,12 +10888,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -10943,8 +10901,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11016,26 +10974,20 @@ objects: { "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 99th Percentile", - "refId": "A", - "step": 10 + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 50th Percentile", - "refId": "B", - "step": 10 + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" }, { "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=\"observatorium-loki-query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ route }} Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -11045,8 +10997,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11097,6 +11049,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -11134,12 +11088,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -11149,8 +11101,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11222,26 +11174,20 @@ objects: { "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 99th Percentile", - "refId": "A", - "step": 10 + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 50th Percentile", - "refId": "B", - "step": 10 + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" }, { "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=\"observatorium-loki-querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ route }} Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -11251,8 +11197,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11303,6 +11249,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -11340,12 +11288,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -11355,8 +11301,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11428,26 +11374,20 @@ objects: { "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 99th Percentile", - "refId": "A", - "step": 10 + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ route }} 50th Percentile", - "refId": "B", - "step": 10 + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" }, { "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=\"observatorium-loki-ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ route }} Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -11457,8 +11397,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11509,6 +11449,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -11546,12 +11488,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -11561,8 +11501,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11634,26 +11574,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_index_request_duration_seconds_bucket{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_index_request_duration_seconds_bucket{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_index_request_duration_seconds_sum{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval])) * 1e3 / sum(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-querier\", operation!=\"index_chunk\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -11663,8 +11597,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11715,6 +11649,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -11752,12 +11688,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_boltdb_shipper_request_duration_seconds_count{ operation=\"Shipper.Query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_boltdb_shipper_request_duration_seconds_count{ operation=\"Shipper.Query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -11767,8 +11701,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11840,26 +11774,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{ operation=\"Shipper.Query\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{ operation=\"Shipper.Query\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_sum{ operation=\"Shipper.Query\"}[$__rate_interval])) * 1e3 / sum(rate(loki_boltdb_shipper_request_duration_seconds_count{ operation=\"Shipper.Query\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -11869,8 +11797,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -11927,7 +11855,7 @@ objects: "value": "${OBSERVATORIUM_API_DATASOURCE}" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -12097,26 +12025,20 @@ objects: { "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\", resource=\"cpu\"} > 0)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\"} / container_spec_cpu_period{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12206,26 +12128,20 @@ objects: { "expr": "max by(pod) (container_memory_working_set_bytes{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\", resource=\"memory\"} > 0)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{ namespace=~\"$namespace\", container=\"observatorium-loki-compactor\"} > 0)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12306,10 +12222,8 @@ objects: { "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{ job=\"observatorium-loki-compactor\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12451,8 +12365,8 @@ objects: "timeShift": null, "title": "Last Compact and Mark Operation Success", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "stat", @@ -12524,10 +12438,8 @@ objects: { "expr": "loki_boltdb_shipper_compact_tables_operation_duration_seconds{ namespace=~\"$namespace\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "duration", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12537,8 +12449,8 @@ objects: "timeShift": null, "title": "Compact and Mark Operations Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -12610,10 +12522,8 @@ objects: { "expr": "sum by (status)(rate(loki_boltdb_shipper_compact_tables_operation_total{ namespace=~\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{success}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12623,8 +12533,8 @@ objects: "timeShift": null, "title": "Compact and Mark Operations Per Status", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -12708,10 +12618,8 @@ objects: { "expr": "count by(action)(loki_boltdb_shipper_retention_marker_table_processed_total{ namespace=~\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{action}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12721,8 +12629,8 @@ objects: "timeShift": null, "title": "Processed Tables Per Action", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -12794,10 +12702,8 @@ objects: { "expr": "count by(table,action)(loki_boltdb_shipper_retention_marker_table_processed_total{ namespace=~\"$namespace\" , action=~\"modified|deleted\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{table}}-{{action}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12807,8 +12713,8 @@ objects: "timeShift": null, "title": "Modified Tables", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -12880,10 +12786,8 @@ objects: { "expr": "sum by (table)(rate(loki_boltdb_shipper_retention_marker_count_total{ namespace=~\"$namespace\"}[$__rate_interval])) >0", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{table}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -12893,8 +12797,8 @@ objects: "timeShift": null, "title": "Marks Creation Rate Per Table", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -12980,7 +12884,6 @@ objects: "expr": "sum (increase(loki_boltdb_shipper_retention_marker_count_total{ namespace=~\"$namespace\"}[24h]))", "format": "time_series", "instant": true, - "intervalFactor": 2, "refId": "A" } ], @@ -12989,8 +12892,8 @@ objects: "timeShift": null, "title": "Marked Chunks (24h)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "singlestat", @@ -13062,26 +12965,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket{ namespace=~\"$namespace\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket{ namespace=~\"$namespace\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum{ namespace=~\"$namespace\"}[$__rate_interval])) * 1e3 / sum(rate(loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count{ namespace=~\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -13091,8 +12988,8 @@ objects: "timeShift": null, "title": "Mark Table Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13178,7 +13075,6 @@ objects: "expr": "sum (increase(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count{ namespace=~\"$namespace\"}[24h]))", "format": "time_series", "instant": true, - "intervalFactor": 2, "refId": "A" } ], @@ -13187,8 +13083,8 @@ objects: "timeShift": null, "title": "Delete Chunks (24h)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "singlestat", @@ -13260,26 +13156,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_bucket{ namespace=~\"$namespace\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_bucket{ namespace=~\"$namespace\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum{ namespace=~\"$namespace\"}[$__rate_interval])) * 1e3 / sum(rate(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count{ namespace=~\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -13289,8 +13179,8 @@ objects: "timeShift": null, "title": "Delete Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13374,10 +13264,8 @@ objects: { "expr": "time() - (loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time{ namespace=~\"$namespace\"} > 0)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "lag", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -13387,8 +13275,8 @@ objects: "timeShift": null, "title": "Sweeper Lag", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13460,10 +13348,8 @@ objects: { "expr": "sum(loki_boltdb_shipper_retention_sweeper_marker_files_current{ namespace=~\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "count", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -13473,8 +13359,8 @@ objects: "timeShift": null, "title": "Marks Files to Process", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13546,10 +13432,8 @@ objects: { "expr": "sum by (status)(rate(loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count{ namespace=~\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -13559,8 +13443,8 @@ objects: "timeShift": null, "title": "Delete Rate Per Status", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13617,7 +13501,7 @@ objects: "value": "${OBSERVATORIUM_API_DATASOURCE}" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -13757,6 +13641,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -13794,12 +13680,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-distributor\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-distributor\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -13809,8 +13693,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13882,26 +13766,20 @@ objects: { "expr": "histogram_quantile(0.99, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-distributor\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "legendFormat": "99th percentile", + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-distributor\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "legendFormat": "50th percentile", + "refId": "B" }, { "expr": "1e3 * sum(job:loki_request_duration_seconds_sum:sum_rate{job=\"observatorium-loki-distributor\"}) / sum(job:loki_request_duration_seconds_count:sum_rate{job=\"observatorium-loki-distributor\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -13911,8 +13789,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -13963,6 +13841,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -14000,12 +13880,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -14015,8 +13893,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14088,26 +13966,20 @@ objects: { "expr": "histogram_quantile(0.99, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "legendFormat": "99th percentile", + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "legendFormat": "50th percentile", + "refId": "B" }, { "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"}) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=\"observatorium-loki-ingester\", route=\"/logproto.Pusher/Push\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -14117,8 +13989,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14169,6 +14041,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -14206,12 +14080,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -14221,8 +14093,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14294,26 +14166,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_index_request_duration_seconds_bucket{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_index_request_duration_seconds_bucket{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_index_request_duration_seconds_sum{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval])) * 1e3 / sum(rate(loki_index_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"index_chunk\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -14323,8 +14189,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14375,6 +14241,8 @@ objects: "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -14412,12 +14280,10 @@ objects: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_boltdb_shipper_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_boltdb_shipper_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -14427,8 +14293,8 @@ objects: "timeShift": null, "title": "QPS", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14500,26 +14366,20 @@ objects: { "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "99th Percentile", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "histogram_quantile(0.50, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", - "intervalFactor": 2, "legendFormat": "50th Percentile", - "refId": "B", - "step": 10 + "refId": "B" }, { "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_sum{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval])) * 1e3 / sum(rate(loki_boltdb_shipper_request_duration_seconds_count{job=\"observatorium-loki-ingester\", operation=\"WRITE\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ @@ -14529,8 +14389,8 @@ objects: "timeShift": null, "title": "Latency", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -14587,7 +14447,7 @@ objects: "value": "${OBSERVATORIUM_API_DATASOURCE}" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml index 4bdfc0ed0f..fc2a240718 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml @@ -2,454 +2,270 @@ apiVersion: v1 data: alertmanager-overview.json: |- { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, "graphTooltip": 1, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "30s", - "rows": [ + "panels": [ { - "collapse": false, "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(alertmanager_alerts{namespace=~\"$namespace\",job=~\"$job\"}) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}}", - "refId": "A" + ], + "title": "Alerts", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "current set of alerts stored in the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Alerts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "showLegend": false }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Received", - "refId": "A" - }, - { - "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Invalid", - "refId": "B" + "expr": "sum(alertmanager_alerts{namespace=~\"$namespace\",job=~\"$job\"}) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}}" + } + ], + "title": "Alerts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "rate of successful and invalid alerts received by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Alerts receive rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "unit": "ops" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Invalid" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Alerts", - "titleSize": "h6", - "type": "row" + "title": "Alerts receive rate", + "type": "timeseries" }, { - "collapse": false, "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 4, "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "integration", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Total", - "refId": "A" - }, - { - "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Failed", - "refId": "B" + ], + "title": "Notifications", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "rate of successful and invalid notifications sent by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "$integration: Notifications Send Rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "unit": "ops" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "repeat": "integration", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Total" }, { - "aliasColors": { - + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Failed" + } + ], + "title": "$integration: Notifications Send Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "latency of notifications sent by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" + } }, - "id": 5, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false + "unit": "s" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 6, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "repeat": "integration", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "integration", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n) \n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} 99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n) \n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Median", - "refId": "B" - }, - { - "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Average", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "$integration: Notification Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} 99th Percentile" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} Median" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} Average" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Notifications", - "titleSize": "h6", - "type": "row" + "title": "$integration: Notification Duration", + "type": "timeseries" } ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "alertmanager-mixin", "observatorium" @@ -458,100 +274,69 @@ data: "list": [ { "current": { + "selected": false, "text": "Prometheus", "value": "Prometheus" }, "hide": 0, "label": "Data Source", "name": "datasource", - "options": [ - - ], "query": "prometheus", - "refresh": 1, - "regex": "", "type": "datasource" }, { - "allValue": null, "current": { + "selected": false, "text": "", "value": "" }, - "datasource": "$datasource", - "hide": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "includeAll": false, "label": "namespace", - "multi": false, "name": "namespace", - "options": [ - - ], "query": "label_values(alertmanager_alerts, namespace)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { - "allValue": null, "current": { + "selected": false, "text": "", "value": "" }, - "datasource": "$datasource", - "hide": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "includeAll": false, "label": "job", - "multi": false, "name": "job", - "options": [ - - ], "query": "label_values(alertmanager_alerts, job)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { - "allValue": null, "current": { - "text": "all", + "selected": false, + "text": "$__all", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "hide": 2, "includeAll": true, - "label": null, - "multi": false, "name": "integration", - "options": [ - - ], "query": "label_values(alertmanager_notifications_total{integration=~\"slack|pagerduty|email|webhook\"}, integration)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, @@ -561,33 +346,12 @@ data: }, "timepicker": { "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" + "30s" ] }, "timezone": "utc", "title": "Alertmanager / Overview", - "uid": "alertmanager-overview", - "version": 0 + "uid": "alertmanager-overview" } kind: ConfigMap metadata: diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml index 5d64083e2d..69d337ec9e 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml @@ -62,10 +62,8 @@ data: { "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -75,8 +73,8 @@ data: "timeShift": null, "title": "Requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -150,10 +148,8 @@ data: { "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\", status=\"failed\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}} {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -163,8 +159,8 @@ data: "timeShift": null, "title": "Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -238,10 +234,8 @@ data: { "expr": "avg by(quantile, job) (gubernator_grpc_request_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\"}) * 1000", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}}th percentile", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -251,8 +245,8 @@ data: "timeShift": null, "title": "Latencies", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -326,10 +320,8 @@ data: { "expr": "sum by(job) (rate(gubernator_over_limit_counter{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -339,8 +331,8 @@ data: "timeShift": null, "title": "Over Limit requests rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -426,10 +418,8 @@ data: { "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -439,8 +429,8 @@ data: "timeShift": null, "title": "Requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -514,10 +504,8 @@ data: { "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\", status=\"failed\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status}} {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -527,8 +515,8 @@ data: "timeShift": null, "title": "Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -602,10 +590,8 @@ data: { "expr": "avg by(quantile, job) (gubernator_grpc_request_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\"}) * 1000", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}}th percentile", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -615,8 +601,8 @@ data: "timeShift": null, "title": "Latencies", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -702,10 +688,8 @@ data: { "expr": "sum by(job) (rate(gubernator_queue_length{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -715,8 +699,8 @@ data: "timeShift": null, "title": "getRateLimitsBatch queue length", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -790,10 +774,8 @@ data: { "expr": "sum by(job) (rate(gubernator_pool_queue_length{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -803,8 +785,8 @@ data: "timeShift": null, "title": "GetRateLimit queue length", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -890,10 +872,8 @@ data: { "expr": "sum by(job) (rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -903,8 +883,8 @@ data: "timeShift": null, "title": "Requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -978,10 +958,8 @@ data: { "expr": "sum by(job) (rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\", type=\"miss\"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -991,8 +969,8 @@ data: "timeShift": null, "title": "Misses", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1066,10 +1044,8 @@ data: { "expr": "sum by(job) (gubernator_cache_size{namespace=\"$namespace\", job=\"observatorium-gubernator\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1079,8 +1055,8 @@ data: "timeShift": null, "title": "Size", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1154,10 +1130,8 @@ data: { "expr": "sum by(job) (rate(gubernator_unexpired_evictions_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1167,8 +1141,8 @@ data: "timeShift": null, "title": "Unexpired evictions", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1254,10 +1228,8 @@ data: { "expr": "avg by(quantile, job) (gubernator_batch_send_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}}th percentile", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1267,8 +1239,8 @@ data: "timeShift": null, "title": "Batch", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1342,10 +1314,8 @@ data: { "expr": "avg by(quantile, job) (gubernator_broadcast_durations{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}}th percentile", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1355,8 +1325,8 @@ data: "timeShift": null, "title": "Broadcast", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1430,10 +1400,8 @@ data: { "expr": "avg by(quantile, job) (gubernator_async_durations{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}}th percentile", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1443,8 +1411,8 @@ data: "timeShift": null, "title": "Async", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1530,10 +1498,8 @@ data: { "expr": "container_memory_working_set_bytes{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"} / 1024^2", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1543,8 +1509,8 @@ data: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1618,10 +1584,8 @@ data: { "expr": "rate(container_cpu_usage_seconds_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1631,8 +1595,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1706,10 +1670,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1719,8 +1681,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1794,18 +1756,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1815,8 +1773,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1870,7 +1828,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-memcached-memcached-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-memcached-memcached-overview.configmap.yaml index fd70936b6f..c24f112a24 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-memcached-memcached-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-memcached-memcached-overview.configmap.yaml @@ -9,7 +9,7 @@ data: }, "editable": true, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "links": [ @@ -53,17 +53,15 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", command=\"get\", status=\"hit\"}[1m])) / sum(rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", command=\"get\"}[1m]))", + "expr": "sum(rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", command=\"get\", status=\"hit\"}[$__rate_interval])) / sum(rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", command=\"get\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Hit Rate", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -73,8 +71,92 @@ data: "timeShift": null, "title": "Hit Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(20,\n max by (namespace, job, instance) (\n memcached_current_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"} / memcached_max_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n))\n", + "format": "time_series", + "legendFormat": "{{ namespace }} / {{ job }} / {{ instance }}", + "legendLink": null + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Top 20 Highest Connection Usage", + "tooltip": { + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -127,7 +209,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -156,12 +238,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without (job, instance) (rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(command, status) (rate(memcached_commands_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{command}} {{status}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -171,8 +251,8 @@ data: "timeShift": null, "title": "Commands", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -213,7 +293,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -242,12 +322,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without (job) (rate(memcached_items_evicted_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(instance) (rate(memcached_items_evicted_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -257,8 +335,8 @@ data: "timeShift": null, "title": "Evictions", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -299,7 +377,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -328,12 +406,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without (job) (rate(memcached_items_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(instance) (rate(memcached_items_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -343,8 +419,8 @@ data: "timeShift": null, "title": "Stored", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -397,7 +473,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -421,17 +497,99 @@ data: ], "spaceLength": 10, - "span": 6, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (\n rate(memcached_process_user_cpu_seconds_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) +\n rate(memcached_process_system_cpu_seconds_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n)\n", + "format": "time_series", + "legendFormat": "{{instance}}", + "legendLink": null + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum without (job) (memcached_current_bytes{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", + "expr": "sum by(instance) (memcached_current_bytes{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -441,8 +599,8 @@ data: "timeShift": null, "title": "Memory", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -483,7 +641,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 8, "legend": { "avg": false, "current": false, @@ -507,17 +665,15 @@ data: ], "spaceLength": 10, - "span": 6, + "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum without (job) (memcached_current_items{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", + "expr": "sum by(instance) (memcached_current_items{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -527,8 +683,8 @@ data: "timeShift": null, "title": "Items", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -565,7 +721,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Memory", + "title": "Resources", "titleSize": "h6" }, { @@ -581,7 +737,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 9, "legend": { "avg": false, "current": false, @@ -605,33 +761,105 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum without (job) (rate(memcached_connections_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(instance) (memcached_current_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - Connection Rate", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "legendLink": null }, { - "expr": "sum without (job) (memcached_current_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", + "expr": "min(memcached_max_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - Current Connrections", - "legendLink": null, - "step": 10 + "legendFormat": "Max Connections (min setting across all instances)", + "legendLink": null + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Connections", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "sum without (job) (memcached_max_connections{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(memcached_connections_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - Max Connections", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "legendLink": null } ], "thresholds": [ @@ -639,10 +867,10 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Connections", + "title": "Connections / sec", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -683,7 +911,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "id": 11, "legend": { "avg": false, "current": false, @@ -707,17 +935,15 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum without (job) (rate(memcached_read_bytes_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(instance) (rate(memcached_read_bytes_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -725,10 +951,10 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Reads", + "title": "Bytes received", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -743,7 +969,7 @@ data: }, "yaxes": [ { - "format": "bps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -769,7 +995,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "id": 12, "legend": { "avg": false, "current": false, @@ -793,17 +1019,15 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum without (job) (rate(memcached_written_bytes_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[1m]))", + "expr": "sum by(instance) (rate(memcached_written_bytes_total{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -811,10 +1035,10 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Writes", + "title": "Bytes transmitted", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -829,7 +1053,7 @@ data: }, "yaxes": [ { - "format": "bps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -867,7 +1091,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 10, + "id": 13, "legend": { "avg": false, "current": false, @@ -1017,19 +1241,15 @@ data: "expr": "count by (job, instance, version) (memcached_version{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "table", "instant": true, - "intervalFactor": 2, "legendFormat": "", - "refId": "A", - "step": 10 + "refId": "A" }, { "expr": "max by (job, instance) (memcached_uptime_seconds{namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"})", "format": "table", "instant": true, - "intervalFactor": 2, "legendFormat": "", - "refId": "B", - "step": 10 + "refId": "B" } ], "thresholds": [ @@ -1039,8 +1259,8 @@ data: "timeShift": null, "title": "Memcached Info", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "transform": "table", @@ -1095,7 +1315,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -1106,7 +1326,7 @@ data: "type": "datasource" }, { - "allValue": ".+", + "allValue": ".*", "current": { "selected": true, "text": "All", @@ -1222,7 +1442,7 @@ data: }, "timezone": "utc", "title": "Memcached Overview", - "uid": "", + "uid": "124d5222454213f748dbfaf69b77ec48", "version": 0 } kind: ConfigMap diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml index b85a29c1b8..e294fe4c7a 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml @@ -61,10 +61,8 @@ data: { "expr": "sum by (namespace, job, resolution) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "compaction {{job}} {{resolution}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -75,7 +73,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -161,7 +159,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -246,10 +244,8 @@ data: { "expr": "sum by (namespace, job, resolution) (rate(thanos_compact_downsample_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "downsample {{job}} {{resolution}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -260,7 +256,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -346,7 +342,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -431,10 +427,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "garbage collection {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -445,7 +439,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -531,7 +525,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -657,7 +651,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -742,10 +736,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_cleaned_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Blocks cleanup {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -756,7 +748,7 @@ data: "title": "Deletion Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -829,10 +821,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_compact_block_cleanup_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Blocks cleanup failures {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -843,7 +833,7 @@ data: "title": "Deletion Error Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -916,10 +906,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_marked_for_deletion_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Blocks marked {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -930,7 +918,7 @@ data: "title": "Marking Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1015,10 +1003,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_blocks_meta_syncs_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "sync {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1029,7 +1015,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1115,7 +1101,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1241,7 +1227,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1326,10 +1312,8 @@ data: { "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1340,7 +1324,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1426,7 +1410,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1552,7 +1536,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1636,50 +1620,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1690,7 +1662,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1762,10 +1734,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1776,7 +1746,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1848,10 +1818,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1862,7 +1830,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1917,7 +1885,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml index 5ec174e851..dd3a74f5c8 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml @@ -99,7 +99,7 @@ data: "title": "Requests Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -190,7 +190,7 @@ data: "title": "Requests Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -269,10 +269,8 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(http_request_duration_seconds_bucket{namespace=\"$namespace\", handler=\"query\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} {{job}} P99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -298,7 +296,7 @@ data: "title": "Latency 99th Percentile", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -421,7 +419,7 @@ data: "title": "Requests Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -512,7 +510,7 @@ data: "title": "Requests Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -591,10 +589,8 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(http_request_duration_seconds_bucket{namespace=\"$namespace\", handler=\"query_range\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} {{job}} P99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -620,7 +616,7 @@ data: "title": "Latency 99th Percentile", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -795,7 +791,7 @@ data: "title": "gRPC (Unary) Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -886,7 +882,7 @@ data: "title": "gRPC (Unary) Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -965,10 +961,8 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\", grpc_type=\"unary\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} {{job}} P99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -994,7 +988,7 @@ data: "title": "gRPC Latency 99th Percentile", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1117,7 +1111,7 @@ data: "title": "Incoming Requests Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1208,7 +1202,7 @@ data: "title": "Incoming Requests Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1287,10 +1281,8 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(http_request_duration_seconds_bucket{namespace=\"$namespace\", handler=\"receive\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} {{job}} P99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1316,7 +1308,7 @@ data: "title": "Incoming Requests Latency 99th Percentile", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1407,10 +1399,8 @@ data: { "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{alertmanager}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1421,7 +1411,7 @@ data: "title": "Alert Sent Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1513,7 +1503,7 @@ data: "title": "Alert Sent Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1592,10 +1582,8 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} {{job}} P99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1621,7 +1609,7 @@ data: "title": "Alert Sent Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1712,10 +1700,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "compaction {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1726,7 +1712,7 @@ data: "title": "Compaction Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1818,7 +1804,7 @@ data: "title": "Compaction Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1873,7 +1859,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml index 3e0c5c15bf..70b35c49dd 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml @@ -93,7 +93,7 @@ data: "title": "Rate of requests", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -198,7 +198,7 @@ data: "title": "Rate of queries", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -283,7 +283,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -409,7 +409,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -494,10 +494,8 @@ data: { "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -508,7 +506,7 @@ data: "title": "Requests", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -581,18 +579,14 @@ data: { "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_gets_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Cache gets - {{job}} {{tripperware}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_misses_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "Cache misses - {{job}} {{tripperware}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -603,7 +597,7 @@ data: "title": "Querier cache gets vs misses", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -676,10 +670,8 @@ data: { "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_fetched_keys_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -690,7 +682,7 @@ data: "title": "Cortex fetched keys", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -763,10 +755,8 @@ data: { "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -777,7 +767,7 @@ data: "title": "Cortex cache hits", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -861,50 +851,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -915,7 +893,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -987,10 +965,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1001,7 +977,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1073,10 +1049,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1087,7 +1061,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1142,7 +1116,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml index fb6b1528a1..7d50b6f179 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml @@ -93,7 +93,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -178,7 +178,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -304,7 +304,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -421,7 +421,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -506,7 +506,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -632,7 +632,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -801,7 +801,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -886,7 +886,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1012,7 +1012,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1181,7 +1181,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1266,7 +1266,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1392,7 +1392,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1477,10 +1477,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "lookups {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1491,7 +1489,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1577,7 +1575,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1662,10 +1660,8 @@ data: { "expr": "max_over_time(thanos_query_concurrent_gate_queries_max{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} - {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1676,7 +1672,7 @@ data: "title": "Concurrent Capacity", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1760,50 +1756,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1814,7 +1798,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1886,10 +1870,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1900,7 +1882,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1972,10 +1954,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1986,7 +1966,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2041,7 +2021,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive-controller.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive-controller.configmap.yaml index 5e6403a908..b7900f7542 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive-controller.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive-controller.configmap.yaml @@ -60,10 +60,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_receive_controller_reconcile_attempts_total{namespace=\"$namespace\", job=\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "rate", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -74,7 +72,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -146,10 +144,8 @@ data: { "expr": "sum by (namespace, job, type) (rate(thanos_receive_controller_reconcile_errors_total{namespace=\"$namespace\", job=\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -160,7 +156,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -244,10 +240,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_receive_controller_configmap_change_attempts_total{namespace=\"$namespace\", job=\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "rate", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -258,7 +252,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -330,10 +324,8 @@ data: { "expr": "sum by (namespace, job, type) (rate(thanos_receive_controller_configmap_change_errors_total{namespace=\"$namespace\", job=\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -344,7 +336,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -428,10 +420,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_receive_hashrings_file_changes_total{job=~\"observatorium-thanos-receive-default.*\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "all", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -442,7 +432,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -528,7 +518,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -612,18 +602,14 @@ data: { "expr": "avg by (namespace, job, name) (thanos_receive_controller_hashring_nodes{namespace=\"$namespace\", job=\"$job\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "receive controller {{name}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "avg by (namespace, job, name) (thanos_receive_hashring_nodes{job=~\"observatorium-thanos-receive-default.*\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "receive {{name}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -634,7 +620,7 @@ data: "title": "Nodes per Hashring", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -706,18 +692,14 @@ data: { "expr": "avg by (namespace, job, name) (thanos_receive_controller_hashring_tenants{namespace=\"$namespace\", job=\"$job\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "receive controller {{name}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "avg by (namespace, job, name) (thanos_receive_hashring_tenants{job=~\"observatorium-thanos-receive-default.*\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "receive {{name}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -728,7 +710,7 @@ data: "title": "Tenants per Hashring", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -816,7 +798,6 @@ data: "expr": "time() - max by (namespace, job) (thanos_receive_controller_configmap_last_reload_success_timestamp_seconds{namespace=\"$namespace\", job=\"$job\"})", "format": "time_series", "instant": true, - "intervalFactor": 2, "refId": "A" } ], @@ -826,7 +807,7 @@ data: "title": "Last Updated", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "singlestat", @@ -902,7 +883,6 @@ data: "expr": "time() - max by (namespace, job) (thanos_receive_config_last_reload_success_timestamp_seconds{namespace=\"$namespace\", job=\"$job\"})", "format": "time_series", "instant": true, - "intervalFactor": 2, "refId": "A" } ], @@ -912,7 +892,7 @@ data: "title": "Last Updated", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "singlestat", @@ -967,7 +947,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml index b95a2efe19..ea6742f751 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml @@ -93,7 +93,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -178,7 +178,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -304,7 +304,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -388,10 +388,8 @@ data: { "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -402,7 +400,7 @@ data: "title": "Rate of write requests (by tenant and code)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -474,10 +472,8 @@ data: { "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -488,7 +484,7 @@ data: "title": "Number of errors (by tenant and code)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -560,10 +556,8 @@ data: { "expr": "sum by (namespace, job, tenant) (rate(http_request_duration_seconds_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$__rate_interval])) / sum by (namespace, job, tenant) (http_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -574,7 +568,7 @@ data: "title": "Average request duration (by tenant)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -658,10 +652,8 @@ data: { "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$__rate_interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -672,7 +664,7 @@ data: "title": "Average successful HTTP request size (per tenant and code, only 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -744,10 +736,8 @@ data: { "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -758,7 +748,7 @@ data: "title": "Average failed HTTP request size (per tenant and code, non 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -830,10 +820,8 @@ data: { "expr": "sum by (namespace, job, tenant, method) (http_inflight_requests{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{method}} - {{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -844,7 +832,7 @@ data: "title": "Inflight requests (per tenant and method)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -928,10 +916,8 @@ data: { "expr": "sum(rate(thanos_receive_write_timeseries_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$__rate_interval])) by (namespace, job, tenant) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -942,7 +928,7 @@ data: "title": "Rate of series received (per tenant, only 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1014,10 +1000,8 @@ data: { "expr": "sum(rate(thanos_receive_write_timeseries_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$__rate_interval])) by (tenant, code) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1028,7 +1012,7 @@ data: "title": "Rate of series not written (per tenant and code, non 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1100,10 +1084,8 @@ data: { "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$__rate_interval])) by (namespace, job, tenant) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1114,7 +1096,7 @@ data: "title": "Rate of samples received (per tenant, only 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1186,10 +1168,8 @@ data: { "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$__rate_interval])) by (tenant, code) ", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1200,7 +1180,7 @@ data: "title": "Rate of samples not written (per tenant and code, non 2XX)", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1285,10 +1265,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_receive_replications_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "all {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1299,7 +1277,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1385,7 +1363,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1470,10 +1448,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_receive_forward_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "all {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1484,7 +1460,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1524,7 +1500,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", + "description": "Shows ratio of errors compared to the total number of forwarded requests to other receive nodes.", "fill": 10, "id": 17, "legend": { @@ -1570,7 +1546,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1739,7 +1715,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1824,7 +1800,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1950,7 +1926,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2119,7 +2095,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2204,7 +2180,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2330,7 +2306,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2499,7 +2475,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2584,7 +2560,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2710,7 +2686,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2838,10 +2814,8 @@ data: "expr": "time() - max by (namespace, job, bucket) (thanos_objstore_bucket_last_successful_upload_time{namespace=\"$namespace\", job=~\"$job\"})", "format": "table", "instant": true, - "intervalFactor": 2, "legendFormat": "", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -2852,7 +2826,7 @@ data: "title": "Successful Upload", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "transform": "table", @@ -2937,50 +2911,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2991,7 +2953,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3063,10 +3025,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3077,7 +3037,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3149,10 +3109,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3163,7 +3121,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3218,7 +3176,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml index b30a218335..d44c5a4675 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml @@ -60,10 +60,8 @@ data: { "expr": "sum by (namespace, job, rule_group, strategy) (rate(prometheus_rule_evaluations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ rule_group }} {{ strategy }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -74,7 +72,7 @@ data: "title": "Rule Group Evaluations", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -146,10 +144,8 @@ data: { "expr": "sum by (namespace, job, rule_group, strategy) (rate(prometheus_rule_evaluation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ rule_group }} {{ strategy }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -160,7 +156,7 @@ data: "title": "Rule Group Evaluations Failed", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -232,10 +228,8 @@ data: { "expr": "sum by (namespace, job, rule_group, strategy) (increase(prometheus_rule_group_iterations_missed_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ rule_group }} {{ strategy }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -246,7 +240,7 @@ data: "title": "Rule Group Evaluations Missed", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -318,10 +312,8 @@ data: { "expr": "(\n sum by(namespace, job, rule_group) (prometheus_rule_group_last_duration_seconds{namespace=\"$namespace\", job=~\"$job\"})\n >\n sum by(namespace, job, rule_group) (prometheus_rule_group_interval_seconds{namespace=\"$namespace\", job=~\"$job\"})\n)\n", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{ rule_group }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -332,7 +324,7 @@ data: "title": "Rule Group Evaluations Too Slow", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -417,10 +409,8 @@ data: { "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{alertmanager}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -431,7 +421,7 @@ data: "title": "Dropped Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -504,10 +494,8 @@ data: { "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{alertmanager}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -518,7 +506,7 @@ data: "title": "Sent Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -604,7 +592,7 @@ data: "title": "Sent Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -730,7 +718,7 @@ data: "title": "Sent Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -815,10 +803,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -829,7 +815,7 @@ data: "title": "Push Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -915,7 +901,7 @@ data: "title": "Drop Ratio", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1084,7 +1070,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1169,7 +1155,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1295,7 +1281,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1464,7 +1450,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1549,7 +1535,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1675,7 +1661,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1759,50 +1745,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1813,7 +1787,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1885,10 +1859,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1899,7 +1871,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1971,10 +1943,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1985,7 +1955,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2040,7 +2010,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml index 876ae39644..555f483536 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml @@ -145,7 +145,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -230,7 +230,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -356,7 +356,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -525,7 +525,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -610,7 +610,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -736,7 +736,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -821,10 +821,8 @@ data: { "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -835,7 +833,7 @@ data: "title": "Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -908,10 +906,8 @@ data: { "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -922,7 +918,7 @@ data: "title": "Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1025,7 +1021,7 @@ data: "title": "Duration", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1110,10 +1106,8 @@ data: { "expr": "sum by (namespace, job) (rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "block loads", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1124,7 +1118,7 @@ data: "title": "Block Load Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1210,7 +1204,7 @@ data: "title": "Block Load Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1283,10 +1277,8 @@ data: { "expr": "sum by (namespace, job, operation) (rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "block drops {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1297,7 +1289,7 @@ data: "title": "Block Drop Rate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1383,7 +1375,7 @@ data: "title": "Block Drop Errors", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1468,10 +1460,8 @@ data: { "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1482,7 +1472,7 @@ data: "title": "Requests", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1555,10 +1545,8 @@ data: { "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1569,7 +1557,7 @@ data: "title": "Hits", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1642,10 +1630,8 @@ data: { "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_added_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1656,7 +1642,7 @@ data: "title": "Added", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1729,10 +1715,8 @@ data: { "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_evicted_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1743,7 +1727,7 @@ data: "title": "Evicted", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1828,26 +1812,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1858,7 +1836,7 @@ data: "title": "Chunk Size", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1942,26 +1920,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1972,7 +1944,7 @@ data: "title": "Block queried", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2045,26 +2017,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2075,7 +2041,7 @@ data: "title": "Data Fetched", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2148,26 +2114,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2178,7 +2138,7 @@ data: "title": "Data Touched", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2250,26 +2210,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2280,7 +2234,7 @@ data: "title": "Result series", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2418,7 +2372,7 @@ data: "title": "Get All", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2544,7 +2498,7 @@ data: "title": "Merge", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2670,7 +2624,7 @@ data: "title": "Gate", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2754,50 +2708,38 @@ data: { "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\", job=~\"$job\"}[30s])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse heap {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "inuse stack {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2808,7 +2750,7 @@ data: "title": "Memory Used", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2880,10 +2822,8 @@ data: { "expr": "go_goroutines{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2894,7 +2834,7 @@ data: "title": "Goroutines", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2966,10 +2906,8 @@ data: { "expr": "go_gc_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2980,7 +2918,7 @@ data: "title": "GC Time Quantiles", "tooltip": { "shared": false, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3035,7 +2973,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml index e0ccb7630c..20269d31f0 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml @@ -116,18 +116,14 @@ data: { "expr": "count(ALERTS{service=~\"observatorium.*|telemeter.*\", alertstate=\"firing\", namespace!=\"\"}) by (namespace, alertname, severity)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{namespace}} - {{severity}} - {{alertname}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "count(ALERTS{service=~\"observatorium.*|telemeter.*\", alertstate=\"firing\", namespace=\"\"}) by (alertname, severity)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "(unknown namespace) - {{severity}} - {{alertname}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -137,8 +133,8 @@ data: "timeShift": null, "title": "Firing alerts by namespace and severity over time", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -259,8 +255,8 @@ data: "timeShift": null, "title": "Rate of requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -349,8 +345,8 @@ data: "timeShift": null, "title": "Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -480,8 +476,8 @@ data: "timeShift": null, "title": "Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -642,8 +638,8 @@ data: "timeShift": null, "title": "Replication request count", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -773,8 +769,8 @@ data: "timeShift": null, "title": "Replication request duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -863,8 +859,8 @@ data: "timeShift": null, "title": "Replication request errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -941,18 +937,14 @@ data: { "expr": "max by (pod) (http_inflight_requests{handler=\"receive\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "concurrency gate used {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max by (pod) (thanos_receive_write_request_concurrency_write_request_limit{namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "concurrency gate limit {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -962,8 +954,8 @@ data: "timeShift": null, "title": "Concurrency gate utilization", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1041,10 +1033,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-receive\", pod=~\"observatorium-thanos-receive.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1054,8 +1044,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1132,10 +1122,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-receive\", pod=~\"observatorium-thanos-receive.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1145,8 +1133,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1223,10 +1211,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-receive\", pod=~\"observatorium-thanos-receive.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1236,8 +1222,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1314,18 +1300,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-thanos-receive.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-thanos-receive.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1335,8 +1317,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1457,8 +1439,8 @@ data: "timeShift": null, "title": "Rate of requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1547,8 +1529,8 @@ data: "timeShift": null, "title": "Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1678,8 +1660,8 @@ data: "timeShift": null, "title": "Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1757,10 +1739,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-query-frontend\", pod=~\"observatorium-thanos-query-frontend.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1770,8 +1750,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1848,10 +1828,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-query-frontend\", pod=~\"observatorium-thanos-query-frontend.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1861,8 +1839,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1939,10 +1917,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-query-frontend\", pod=~\"observatorium-thanos-query-frontend.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1952,8 +1928,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2030,18 +2006,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-thanos-query-frontend.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-thanos-query-frontend.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2051,8 +2023,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2173,8 +2145,8 @@ data: "timeShift": null, "title": "Instant Query Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2263,8 +2235,8 @@ data: "timeShift": null, "title": "Instant Query Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2394,8 +2366,8 @@ data: "timeShift": null, "title": "Instant Query Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2504,8 +2476,8 @@ data: "timeShift": null, "title": "Range Query Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2594,8 +2566,8 @@ data: "timeShift": null, "title": "Range Query Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2725,8 +2697,8 @@ data: "timeShift": null, "title": "Range Query Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2798,10 +2770,8 @@ data: { "expr": "max_over_time(thanos_query_concurrent_gate_queries_max{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} - {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2811,8 +2781,8 @@ data: "timeShift": null, "title": "Concurrent Capacity", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2890,10 +2860,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-query\", pod=~\"observatorium-thanos-query.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2903,8 +2871,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2981,10 +2949,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-query\", pod=~\"observatorium-thanos-query.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2994,8 +2960,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3072,10 +3038,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-query\", pod=~\"observatorium-thanos-query.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3085,8 +3049,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3163,18 +3127,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-thanos-query.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-thanos-query.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3184,8 +3144,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3306,8 +3266,8 @@ data: "timeShift": null, "title": "Instant Query Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3396,8 +3356,8 @@ data: "timeShift": null, "title": "Instant Query Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3527,8 +3487,8 @@ data: "timeShift": null, "title": "Instant Query Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3637,8 +3597,8 @@ data: "timeShift": null, "title": "Range Query Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3727,8 +3687,8 @@ data: "timeShift": null, "title": "Range Query Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3858,8 +3818,8 @@ data: "timeShift": null, "title": "Range Query Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -3931,10 +3891,8 @@ data: { "expr": "max_over_time(thanos_query_concurrent_gate_queries_max{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}} - {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -3944,8 +3902,8 @@ data: "timeShift": null, "title": "Concurrent Capacity", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4023,10 +3981,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-query\", pod=~\"observatorium-ruler-query.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4036,8 +3992,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4114,10 +4070,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-query\", pod=~\"observatorium-ruler-query.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4127,8 +4081,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4205,10 +4159,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-query\", pod=~\"observatorium-ruler-query.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4218,8 +4170,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4296,18 +4248,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-ruler-query.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-ruler-query.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4317,8 +4265,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4408,10 +4356,8 @@ data: { "expr": "sum by (job, rule_group) (rate(prometheus_rule_evaluations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{rule_group}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4421,8 +4367,8 @@ data: "timeShift": null, "title": "Total evaluations", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4500,10 +4446,8 @@ data: { "expr": "sum by (job, rule_group) (rate(prometheus_rule_evaluation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{rule_group}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4513,8 +4457,8 @@ data: "timeShift": null, "title": "Failed evaluations", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4591,10 +4535,8 @@ data: { "expr": "sum by (job, strategy) (rate(thanos_rule_evaluation_with_warnings_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{rule_group}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4604,8 +4546,8 @@ data: "timeShift": null, "title": "Evaluations with warnings", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4683,10 +4625,8 @@ data: { "expr": "sum by(job, rule_group) (prometheus_rule_group_last_duration_seconds{namespace=\"$namespace\", job=~\"$job\"}) / sum by(job, rule_group) (prometheus_rule_group_interval_seconds{namespace=\"$namespace\", job=~\"$job\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{rule_group}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4696,8 +4636,8 @@ data: "timeShift": null, "title": "Too slow evaluations", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4775,10 +4715,8 @@ data: { "expr": "sum by (job) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4788,8 +4726,8 @@ data: "timeShift": null, "title": "Rate of sent alerts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4867,10 +4805,8 @@ data: { "expr": "sum by (job) (rate(thanos_alert_sender_errors_total{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (job) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4880,8 +4816,8 @@ data: "timeShift": null, "title": "Rate of send alerts errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -4959,26 +4895,20 @@ data: { "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p50", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p90", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "p99", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -4988,8 +4918,8 @@ data: "timeShift": null, "title": "Duration od send alerts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5067,10 +4997,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-rule\", pod=~\"observatorium-thanos-rule.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -5080,8 +5008,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5158,10 +5086,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-rule\", pod=~\"observatorium-thanos-rule.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -5171,8 +5097,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5249,18 +5175,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-thanos-rule.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-thanos-rule.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -5270,8 +5192,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5348,10 +5270,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-rule\", pod=~\"observatorium-thanos-rule.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -5361,8 +5281,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5535,8 +5455,8 @@ data: "timeShift": null, "title": "Unary gRPC Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5625,8 +5545,8 @@ data: "timeShift": null, "title": "Unary gRPC Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5756,8 +5676,8 @@ data: "timeShift": null, "title": "Unary gRPC Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -5918,8 +5838,8 @@ data: "timeShift": null, "title": "Sreamed gRPC Rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6008,8 +5928,8 @@ data: "timeShift": null, "title": "Sreamed gRPC Errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6139,8 +6059,8 @@ data: "timeShift": null, "title": "Sreamed gRPC Duration", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6212,26 +6132,20 @@ data: { "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "mean: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50: {{data_type}} / {{job}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6241,8 +6155,8 @@ data: "timeShift": null, "title": "Data Touched", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6372,8 +6286,8 @@ data: "timeShift": null, "title": "Get All", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6503,8 +6417,8 @@ data: "timeShift": null, "title": "Merge", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6582,10 +6496,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"thanos-store\", pod=~\"observatorium-thanos-store.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6595,8 +6507,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6673,10 +6585,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"thanos-store\", pod=~\"observatorium-thanos-store.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6686,8 +6596,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6764,10 +6674,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"thanos-store\", pod=~\"observatorium-thanos-store.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6777,8 +6685,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6855,18 +6763,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-thanos-store.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-thanos-store.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6876,8 +6780,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -6967,10 +6871,8 @@ data: { "expr": "sum(rate(gubernator_grpc_request_counts{namespace=\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "gRPC requests {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -6980,8 +6882,8 @@ data: "timeShift": null, "title": "Rate of gRPC requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7059,10 +6961,8 @@ data: { "expr": "sum(rate(gubernator_grpc_request_counts{status=\"failed\",namespace=\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "gRPC request errors {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7072,8 +6972,8 @@ data: "timeShift": null, "title": "Rate of errors in gRPC requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7151,18 +7051,14 @@ data: { "expr": "gubernator_grpc_request_duration{quantile=\"0.99\", namespace=\"$namespace\",job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P99: {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "gubernator_grpc_request_duration{quantile=\"0.5\", namespace=\"$namespace\",job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "P50: {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7172,8 +7068,8 @@ data: "timeShift": null, "title": "Duration of gRPC requests", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7251,10 +7147,8 @@ data: { "expr": "gubernator_pool_queue_length{namespace=\"$namespace\",job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "local queue size {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7264,8 +7158,8 @@ data: "timeShift": null, "title": "Local queue of rate checks", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7343,10 +7237,8 @@ data: { "expr": "gubernator_queue_length{namespace=\"$namespace\",job=~\"$job\"}", "format": "time_series", - "intervalFactor": 2, "legendFormat": "peer queue size {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7356,8 +7248,8 @@ data: "timeShift": null, "title": "Peer queue of rate checks", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7429,10 +7321,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7442,8 +7332,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7514,10 +7404,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7527,8 +7415,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7599,10 +7487,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7612,8 +7498,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7684,18 +7570,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7705,8 +7587,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7796,18 +7678,14 @@ data: { "expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alerts received {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "alerts invalid {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7817,8 +7695,8 @@ data: "timeShift": null, "title": "Alerts receive rate", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7896,10 +7774,8 @@ data: { "expr": "(container_memory_working_set_bytes{container=\"observatorium-alertmanager\", pod=~\"observatorium-alertmanager.*\", namespace=\"$namespace\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "memory usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -7909,8 +7785,8 @@ data: "timeShift": null, "title": "Memory Used", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7987,10 +7863,8 @@ data: { "expr": "rate(process_cpu_seconds_total{container=\"observatorium-alertmanager\", pod=~\"observatorium-alertmanager.*\", namespace=\"$namespace\"}[$interval]) * 100", "format": "time_series", - "intervalFactor": 2, "legendFormat": "cpu usage system {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8000,8 +7874,8 @@ data: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8078,10 +7952,8 @@ data: { "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"observatorium-alertmanager\", pod=~\"observatorium-alertmanager.*\", namespace=\"$namespace\",})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "pod restart count {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8091,8 +7963,8 @@ data: "timeShift": null, "title": "Pod/Container Restarts", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8169,18 +8041,14 @@ data: { "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-alertmanager.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic in {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-alertmanager.*\", namespace=\"$namespace\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "network traffic out {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8190,8 +8058,8 @@ data: "timeShift": null, "title": "Network Usage", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8281,10 +8149,8 @@ data: { "expr": "sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "reloads {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8294,8 +8160,8 @@ data: "timeShift": null, "title": "Rate of reloads", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8373,10 +8239,8 @@ data: { "expr": "100 * sum(rate(obsctl_reloader_prom_rule_set_failures_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job, reason) / ignoring (job, reason) group_left sum(rate(obsctl_reloader_prom_rule_sets_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace) > 0", "format": "time_series", - "intervalFactor": 2, "legendFormat": "reload error: {{reason}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8386,8 +8250,8 @@ data: "timeShift": null, "title": "Percentage of reload errors", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8465,10 +8329,8 @@ data: { "expr": "sum(rate(obsctl_reloader_prom_rules_store_ops_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod,status_code) > 0", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{status_code}} - pod {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8478,8 +8340,8 @@ data: "timeShift": null, "title": "Responses from Observatorium Rules API", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8557,10 +8419,8 @@ data: { "expr": "sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", "format": "time_series", - "intervalFactor": 2, "legendFormat": "fetches {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8570,8 +8430,8 @@ data: "timeShift": null, "title": "Rate of fetches", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8649,10 +8509,8 @@ data: { "expr": "100 * sum(rate(obsctl_reloader_prom_rule_fetch_failures_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod) / ignoring (job, pod) group_left sum(rate(obsctl_reloader_prom_rule_fetches_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace) > 0", "format": "time_series", - "intervalFactor": 2, "legendFormat": "failed fetches {{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -8662,8 +8520,8 @@ data: "timeShift": null, "title": "Percentage of failed fetches", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -8717,7 +8575,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -8733,7 +8591,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-rules-objstore.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-rules-objstore.configmap.yaml index 8cceb27102..119cd3559e 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-rules-objstore.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-rules-objstore.configmap.yaml @@ -60,10 +60,8 @@ data: { "expr": "sum by (tenant) (rate(rules_objstore_validations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -73,8 +71,8 @@ data: "timeShift": null, "title": "Successful validations", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -146,10 +144,8 @@ data: { "expr": "sum by (tenant) (rate(rules_objstore_validations_failed_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -159,8 +155,8 @@ data: "timeShift": null, "title": "Failed validations", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -244,10 +240,8 @@ data: { "expr": "sum by (tenant) (rules_objstore_rule_groups_configured{namespace=\"$namespace\", job=~\"$job\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -257,8 +251,8 @@ data: "timeShift": null, "title": "Rule groups configured", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -330,10 +324,8 @@ data: { "expr": "sum by (tenant) (rules_objstore_rules_configured{namespace=\"$namespace\", job=~\"$job\"})", "format": "time_series", - "intervalFactor": 2, "legendFormat": "{{tenant}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -343,8 +335,8 @@ data: "timeShift": null, "title": "Rules configured", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -398,7 +390,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ @@ -414,7 +406,7 @@ data: "value": "default" }, "hide": 0, - "label": null, + "label": "Data source", "name": "datasource", "options": [ diff --git a/resources/observability/prometheusrules/observatorium-alertmanager-production.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-alertmanager-production.prometheusrules.yaml index 92037e3df9..3d67a67ba3 100644 --- a/resources/observability/prometheusrules/observatorium-alertmanager-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-alertmanager-production.prometheusrules.yaml @@ -54,7 +54,7 @@ spec: ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager"}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager"}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager"}[5m]) ) > 0.01 for: 5m @@ -72,7 +72,7 @@ spec: min by (namespace,job, integration) ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) ) > 0.01 for: 5m @@ -90,7 +90,7 @@ spec: min by (namespace,job, integration) ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) ) > 0.01 for: 5m diff --git a/resources/observability/prometheusrules/observatorium-alertmanager-stage.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-alertmanager-stage.prometheusrules.yaml index 5f47018ce8..02bce20834 100644 --- a/resources/observability/prometheusrules/observatorium-alertmanager-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-alertmanager-stage.prometheusrules.yaml @@ -54,7 +54,7 @@ spec: ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager"}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager"}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager"}[5m]) ) > 0.01 for: 5m @@ -72,7 +72,7 @@ spec: min by (namespace,job, integration) ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m]) ) > 0.01 for: 5m @@ -90,7 +90,7 @@ spec: min by (namespace,job, integration) ( rate(alertmanager_notifications_failed_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) / - rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) + ignoring (reason) group_left rate(alertmanager_notifications_total{job="observatorium-alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m]) ) > 0.01 for: 5m diff --git a/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml b/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml index 2de23aca58..e67d15b5c0 100755 --- a/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml +++ b/resources/services/app-sre-stage-01/rhobs/telemeter/observatorium-metrics-ruler-telemeter-template.yaml @@ -115,17 +115,17 @@ objects: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: acm_top500_mcs:acm_managed_cluster_info - expr: | - max by(_id) (sum_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m]) / scalar(count_over_time(vector(1)[1h:5m]))) + max by(_id) (sum_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(steps:count1h) labels: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: cluster:usage:workload:capacity_physical_cpu_hours - expr: | - max by(_id) (count_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(count_over_time(vector(1)[1h:5m])) + max by(_id) (count_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(steps:count1h) labels: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: cluster:usage:workload:capacity_physical_instance_hours - expr: | - sum(sum_over_time(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io = ''}[1h:5m])) by (_id) / scalar(count_over_time(vector(1)[1h:5m])) + sum by(_id) (sum_over_time(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io = ''}[1h:5m])) / scalar(steps:count1h) labels: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: cluster:usage:workload:capacity_virtual_cpu_hours @@ -162,7 +162,7 @@ objects: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: acm_capacity_effective_cpu_cores - expr: | - max by(_id)(sum_over_time(hostedcluster:hypershift_cluster_vcpus:max[1h:5m])) / scalar(count_over_time(vector(1)[1h:5m])) + max by(_id) (sum_over_time(hostedcluster:hypershift_cluster_vcpus:max[1h:5m])) / scalar(steps:count1h) labels: tenant_id: FB870BF3-9F3A-44FF-9BF7-D7A047A52F43 record: hostedcluster:hypershift_cluster_vcpus:vcpu_hours diff --git a/resources/services/metric-federation-rule-template.yaml b/resources/services/metric-federation-rule-template.yaml index d8db45cead..727b0a5fbb 100644 --- a/resources/services/metric-federation-rule-template.yaml +++ b/resources/services/metric-federation-rule-template.yaml @@ -214,6 +214,17 @@ objects: requests: cpu: ${THANOS_RULER_CPU_REQUEST} memory: ${THANOS_RULER_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/rule diff --git a/resources/services/observatorium-metrics-template.yaml b/resources/services/observatorium-metrics-template.yaml index ec4b40a09b..f6e895574e 100644 --- a/resources/services/observatorium-metrics-template.yaml +++ b/resources/services/observatorium-metrics-template.yaml @@ -615,6 +615,17 @@ objects: requests: cpu: ${THANOS_RULER_QUERIER_CPU_REQUEST} memory: ${THANOS_RULER_QUERIER_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /etc/thanos/sd @@ -924,6 +935,17 @@ objects: requests: cpu: ${THANOS_QUERIER_CPU_REQUEST} memory: ${THANOS_QUERIER_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /etc/thanos/sd @@ -1244,6 +1266,17 @@ objects: requests: cpu: ${THANOS_QUERY_FRONTEND_CPU_REQUEST} memory: ${THANOS_QUERY_FRONTEND_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError - args: - -provider=openshift @@ -1563,7 +1596,7 @@ objects: memory: 24Mi securityContext: {} securityContext: {} - serviceAccount: observatorium-thanos-receive-controller + serviceAccountName: observatorium-thanos-receive-controller - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: @@ -1591,6 +1624,7 @@ objects: resources: - pods verbs: + - list - get - update - apiGroups: @@ -2131,17 +2165,17 @@ objects: "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "acm_top500_mcs:acm_managed_cluster_info" - "expr": | - max by(_id) (sum_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m]) / scalar(count_over_time(vector(1)[1h:5m]))) + max by(_id) (sum_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(steps:count1h) "labels": "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "cluster:usage:workload:capacity_physical_cpu_hours" - "expr": | - max by(_id) (count_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(count_over_time(vector(1)[1h:5m])) + max by(_id) (count_over_time(cluster:usage:workload:capacity_physical_cpu_cores:max:5m[1h:5m])) / scalar(steps:count1h) "labels": "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "cluster:usage:workload:capacity_physical_instance_hours" - "expr": | - sum(sum_over_time(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io = ''}[1h:5m])) by (_id) / scalar(count_over_time(vector(1)[1h:5m])) + sum by(_id) (sum_over_time(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io = ''}[1h:5m])) / scalar(steps:count1h) "labels": "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "cluster:usage:workload:capacity_virtual_cpu_hours" @@ -2178,7 +2212,7 @@ objects: "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "acm_capacity_effective_cpu_cores" - "expr": | - max by(_id)(sum_over_time(hostedcluster:hypershift_cluster_vcpus:max[1h:5m])) / scalar(count_over_time(vector(1)[1h:5m])) + max by(_id) (sum_over_time(hostedcluster:hypershift_cluster_vcpus:max[1h:5m])) / scalar(steps:count1h) "labels": "tenant_id": "FB870BF3-9F3A-44FF-9BF7-D7A047A52F43" "record": "hostedcluster:hypershift_cluster_vcpus:vcpu_hours" @@ -2392,6 +2426,17 @@ objects: requests: cpu: ${THANOS_RULER_CPU_REQUEST} memory: ${THANOS_RULER_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/rule @@ -2962,6 +3007,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store @@ -3219,6 +3275,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store @@ -3476,6 +3543,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store @@ -3733,6 +3811,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store @@ -3990,6 +4079,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store @@ -4247,6 +4347,17 @@ objects: requests: cpu: ${THANOS_STORE_CPU_REQUEST} memory: ${THANOS_STORE_MEMORY_REQUEST} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/thanos/store diff --git a/resources/services/telemeter-template.yaml b/resources/services/telemeter-template.yaml index f74c84b612..a95641cee1 100644 --- a/resources/services/telemeter-template.yaml +++ b/resources/services/telemeter-template.yaml @@ -902,7 +902,7 @@ objects: - --oidc.client-id=$(OIDC_CLIENT_ID) - --oidc.client-secret=$(OIDC_CLIENT_SECRET) - --oidc.issuer-url=$(OIDC_ISSUER_URL) - - --url=http://observatorium-observatorium-api.${OBSERVATORIUM_NAMESPACE}.svc:8080/api/metrics/v1/telemeter + - --upstream.url=http://observatorium-observatorium-api.${OBSERVATORIUM_NAMESPACE}.svc:8080/api/metrics/v1/telemeter env: - name: OIDC_AUDIENCE valueFrom: