diff --git a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json old mode 100755 new mode 100644 index eb857dd..bf05359 --- a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json @@ -1,4 +1,53 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,6 +73,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, + "id": null, "links": [ { "asDropdown": true, @@ -45,7 +95,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -86,12 +136,12 @@ }, "textMode": "auto" }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -111,7 +161,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -156,12 +206,12 @@ }, "textMode": "auto" }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -180,7 +230,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -226,12 +276,12 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -250,7 +300,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -296,12 +346,12 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -333,7 +383,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -415,7 +465,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "(\r\n average_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_ib_port_xmit_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", @@ -431,7 +481,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -512,7 +562,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "(\r\n average_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_ib_port_rcv_data{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", @@ -541,7 +591,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -623,7 +673,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -643,7 +693,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -727,7 +777,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -760,7 +810,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device SM Clock", "fieldConfig": { @@ -842,7 +892,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -862,7 +912,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Memory Clock", "fieldConfig": { @@ -944,7 +994,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -977,7 +1027,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Temperature", "fieldConfig": { @@ -1059,7 +1109,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1079,7 +1129,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory temperature (in C)", "fieldConfig": { @@ -1161,7 +1211,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1194,7 +1244,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU Power", "fieldConfig": { @@ -1275,7 +1325,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1295,7 +1345,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total energy consumption since boot (in mJ)", "fieldConfig": { @@ -1376,7 +1426,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1409,7 +1459,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1523,7 +1573,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "ib_port_physical_state{subscription=\"$Subscription\", cluster=\"$Cluster\", job_id=~\"$JobId\"} == 0\r\n", @@ -1614,7 +1664,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1677,7 +1727,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_gpu_temp{subscription=\"$Subscription\", cluster=\"$Cluster\",gpu_id=\"0\"}", @@ -1786,14 +1836,10 @@ "type": "custom" }, { - "current": { - "selected": false, - "text": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109", - "value": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up,subscription)", "hide": 0, @@ -1813,15 +1859,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"},cluster)", "hide": 0, @@ -1841,15 +1882,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"},job_id)", "hide": 0, diff --git a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json old mode 100755 new mode 100644 index d02e41e..c7468e2 --- a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,6 +61,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": null, "links": [ { "asDropdown": true, @@ -58,7 +96,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -140,7 +178,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -160,7 +198,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device utilization", "fieldConfig": { @@ -242,7 +280,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -275,7 +313,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation on VM GPU device SM Clock", "fieldConfig": { @@ -357,7 +395,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -377,7 +415,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Memory Clock", "fieldConfig": { @@ -459,7 +497,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -492,7 +530,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over NVLink.", "fieldConfig": { @@ -574,7 +612,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -594,7 +632,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data received over NVLink.", "fieldConfig": { @@ -676,7 +714,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -709,7 +747,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU device Temperature", "fieldConfig": { @@ -791,7 +829,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -811,7 +849,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory temperature (in C)", "fieldConfig": { @@ -893,7 +931,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -926,7 +964,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Selected Operation of VM GPU Power", "fieldConfig": { @@ -1007,7 +1045,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1027,7 +1065,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total energy consumption since boot (in mJ)", "fieldConfig": { @@ -1108,7 +1146,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -1138,7 +1176,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Current throttle code ", "fieldConfig": { @@ -1217,7 +1255,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_current_clock_throttle_reasons{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1246,7 +1284,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of single-bit volatile ECC errors", "fieldConfig": { @@ -1326,7 +1364,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_sbe_volatile_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1341,7 +1379,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of double-bit volatile ECC errors", "fieldConfig": { @@ -1421,7 +1459,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_dbe_volatile_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1436,7 +1474,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of double-bit persistent ECC errors", "fieldConfig": { @@ -1516,7 +1554,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_dbe_aggregate_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1531,7 +1569,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of single-bit persistent ECC errors", "fieldConfig": { @@ -1611,7 +1649,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_ecc_sbe_aggregate_total{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1640,7 +1678,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1702,7 +1740,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "dcgm_gpu_temp{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\",gpu_id=\"0\"}", @@ -1783,7 +1821,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1859,7 +1897,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "expr": "node_gpu_burn_mon{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\", gpu_id=~\"$GPU\"}", @@ -1874,7 +1912,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -1950,7 +1988,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "node_meta_seq_mon{subscription=~\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\"}", @@ -1977,14 +2015,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization,subscription)", "hide": 0, @@ -2004,14 +2038,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"}, cluster)", "hide": 0, @@ -2031,14 +2061,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"}, job_id)", "hide": 0, @@ -2058,14 +2084,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{cluster=\"$Cluster\", job_id=~\"$JobId\"}, instance)", "hide": 0, @@ -2085,18 +2107,10 @@ "type": "query" }, { - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(dcgm_gpu_utilization{instance=~\"$Instance\"},gpu_id)", "hide": 0, diff --git a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json old mode 100755 new mode 100644 index 61c6d43..3b7b39d --- a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,6 +61,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": null, "links": [ { "asDropdown": true, @@ -58,7 +96,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Indication of IB Link Flap", "fieldConfig": { @@ -157,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "exemplar": false, @@ -177,7 +215,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over InfiniBand.", "fieldConfig": { @@ -259,7 +297,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -279,7 +317,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "The rate of data transmitted over InfiniBand.", "fieldConfig": { @@ -361,7 +399,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -391,7 +429,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of outbound packets discarded by the port because the port is down or congested.", "fieldConfig": { @@ -471,7 +509,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -491,7 +529,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets not transmitted from the switch physical port.", "fieldConfig": { @@ -571,7 +609,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -591,7 +629,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets containing an error that were received on the port.", "fieldConfig": { @@ -671,7 +709,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -691,7 +729,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Total number of packets received on the switch physical port that are discarded.", "fieldConfig": { @@ -771,7 +809,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -791,7 +829,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Indication of IB Link Flap", "fieldConfig": { @@ -856,7 +894,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -935,7 +973,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -989,7 +1027,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "ib_port_rcv_errors{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=~\"$Instance\", ib_port=\"mlx5_ib0:1\"}", @@ -1068,14 +1106,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state, subscription)", "hide": 0, @@ -1095,14 +1129,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{subscription=~\"$Subscription\"}, cluster)", "hide": 0, @@ -1122,14 +1152,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\"},job_id)", "hide": 0, @@ -1149,14 +1175,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, @@ -1176,18 +1198,10 @@ "type": "query" }, { - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(ib_port_physical_state{instance=~\"$Instance\"}, ib_port)", "hide": 0, diff --git a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json old mode 100755 new mode 100644 index f481833..683b558 --- a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json @@ -1,4 +1,41 @@ { + "__inputs": [ + { + "name": "DS_MANAGED_PROMETHEUS_MONEO-AMW", + "label": "Managed_Prometheus_moneo-amw", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.16" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -25,6 +62,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": null, "links": [ { "asDropdown": true, @@ -59,7 +97,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "CPU Utilization", "fieldConfig": { @@ -147,7 +185,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -167,7 +205,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "CPU Utilization", "fieldConfig": { @@ -255,7 +293,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -285,7 +323,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory Utilization", "fieldConfig": { @@ -372,7 +410,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -392,7 +430,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "Memory Utilization", "fieldConfig": { @@ -479,7 +517,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -513,7 +551,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "TX Rate of VM's Ethernet Interface", "fieldConfig": { @@ -597,7 +635,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -617,7 +655,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "description": "RX Rate of VM's Ethernet Interface", "fieldConfig": { @@ -701,7 +739,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "code", "exemplar": false, @@ -738,7 +776,7 @@ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "fieldConfig": { "defaults": { @@ -788,12 +826,12 @@ }, "showHeader": true }, - "pluginVersion": "9.5.13", + "pluginVersion": "9.5.16", "targets": [ { "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "editorMode": "builder", "expr": "node_mem_util{subscription=\"$Subscription\", cluster=\"$Cluster\", instance=\"$Instance\"}", @@ -877,14 +915,10 @@ "templating": { "list": [ { - "current": { - "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up,subscription)", "hide": 0, @@ -904,14 +938,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{subscription=\"$Subscription\"},cluster)", "hide": 0, @@ -931,15 +961,10 @@ "type": "query" }, { - "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{cluster=\"$Cluster\"},job_id)", "description": "", @@ -960,14 +985,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "none", - "value": "none" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(up{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, @@ -987,14 +1008,10 @@ "type": "query" }, { - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "moneo-amw" + "uid": "${DS_MANAGED_PROMETHEUS_MONEO-AMW}" }, "definition": "label_values(node_cpu_util{instance=~\"$Instance\"},numa_domain)", "hide": 0, diff --git a/linux_service/README.md b/linux_service/README.md index 16f090e..049e319 100644 --- a/linux_service/README.md +++ b/linux_service/README.md @@ -67,6 +67,10 @@ The [start_moneo_services.sh](./start_moneo_services.sh) script is used to start or ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh geneva"``` +#### Exporters Alone #### + +```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh workers"``` + #### Exporters with Managed Prometheus #### ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh"``` diff --git a/linux_service/start_moneo_services.sh b/linux_service/start_moneo_services.sh index fc12fd0..5d05dfd 100755 --- a/linux_service/start_moneo_services.sh +++ b/linux_service/start_moneo_services.sh @@ -5,6 +5,7 @@ # Managed Prometheus deployment: ./start_moneo_services.sh # Azure Monitor: ./start_moneo_services.sh azure_monitor # Geneva (internal msft): ./start_moneo_services.sh geneva +# Only start workers: ./start_moneo_services.sh workers PublisherMethod=$1 # Modify as necessary @@ -20,22 +21,24 @@ fi procs=("net_exporter" "node_exporter") -if lspci | grep -iq NVIDIA ; then +if lspci | grep -iq NVIDIA; then procs+=("nvidia_exporter") fi if [[ -n $PublisherMethod ]]; then if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then echo "PublisherMethod is valid: $PublisherMethod" + procs+=("metrics_publisher") + elif [ "$PublisherMethod" == "workers" ]; then + echo "Only starting workers" else echo "PublisherMethod is not one of the valid choices." exit 1 fi - procs+=("metrics_publisher") fi function proc_check(){ - CHECK=`ps -eaf | grep /tmp/moneo-worker/` + CHECK=$( ps -eaf | grep /tmp/moneo-worker/) WITH_MANAGED_PROM=$1 for substring in "${procs[@]}"; do if [[ $CHECK == *"$substring"* ]]; then @@ -48,7 +51,7 @@ function proc_check(){ if [[ -n $WITH_MANAGED_PROM && $WITH_MANAGED_PROM = true ]]; then - if [[ $(sudo docker ps -a | grep prometheus) ]] ; then + if [[ $(sudo docker ps -a | grep prometheus) ]]; then echo "Prometheus docker containers running." else echo "Prometheus failed to start. Please ensure you have the proper user managed identity assigned to your VMSS/VM." @@ -69,6 +72,10 @@ sudo systemctl start moneo@node_exporter.service sudo systemctl start moneo@net_exporter.service sudo systemctl start moneo@nvidia_exporter.service +if [ "$PublisherMethod" == "workers" ]; then + proc_check false +fi + if [[ -n $PublisherMethod ]]; then if [ "$PublisherMethod" == "geneva" ]; then sudo $MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config