diff --git a/README.md b/README.md index 6f22354..5c80526 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ This method requires a deploying of a head node to host the local Prometheus dat - The headnode must have enough storage available to facilitate data collection - Grafana and Prometheus are accessed via web browser. Ensure proper access from web browser to headnode IP. -Complete the steps listed here: [Local Grafana Deployment Guide](./docs/LocalGrafanDeployment.md) +Complete the steps listed here: [Local Grafana Deployment Guide](./docs/LocalGrafanaDeployment.md) ### Moneo CLI ### @@ -156,7 +156,7 @@ Note: For more options check the Moneo help menu ## User Docs ## - [Headless Deployment Guide](./docs/HeadlessDeployment.md) -- [Local Grafana Deployment Guide](./docs/LocalGrafanDeployment.md) +- [Local Grafana Deployment Guide](./docs/LocalGrafanaDeployment.md) - To get started with job level filtering see: [Job Level Filtering](./docs/JobFiltering.md) - Slurm epilog/prolog integration: [Slurm example](./examples/slurm/README.md) - To deploy moneo-worker inside container: [Moneo-exporter](./docs/Moneo-exporter.md) diff --git a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json index 41bf7f8..1160540 100755 --- a/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Cluster_View.json @@ -43,13 +43,288 @@ ], "liveNow": false, "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^cluster$/", + "values": true + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.13", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(cluster) (python_info{subscription=\"$Subscription\", cluster=\"$Cluster\"})", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "Cluster Name", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 7, + "y": 0 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.13", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count (dcgm_gpu_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\", job_id=\"$JobId\", gpu_id=\"0\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "Total Node in Cluster", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"dark-green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 36, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.5.13", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg (\r\n average_dcgm_gpu_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_dcgm_gpu_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_dcgm_gpu_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Average GPU Utilization", + "transparent": true, + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 37, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.5.13", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" + }, + "editorMode": "code", + "exemplar": false, 
+ "expr": "avg (\r\n average_dcgm_mem_copy_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 1) +\r\n min_dcgm_mem_copy_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 2) +\r\n max_dcgm_mem_copy_utilization{subscription=\"$Subscription\", cluster=\"$Cluster\",job_id=~\"$JobId\"} * ($Operation== bool 3)\r\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Average GPU Mem Utilization", + "transparent": true, + "type": "gauge" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 0 + "y": 8 }, "id": 29, "panels": [], @@ -119,7 +394,7 @@ "h": 8, "w": 12, "x": 0, - "y": 1 + "y": 9 }, "id": 30, "options": { @@ -151,6 +426,7 @@ } ], "title": "IB TX Rate", + "transparent": true, "type": "timeseries" }, { @@ -215,7 +491,7 @@ "h": 8, "w": 12, "x": 12, - "y": 1 + "y": 9 }, "id": 31, "options": { @@ -247,6 +523,7 @@ } ], "title": "IB RX Rate", + "transparent": true, "type": "timeseries" }, { @@ -255,7 +532,7 @@ "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 17 }, "id": 11, "panels": [], @@ -325,7 +602,7 @@ "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 18 }, "id": 2, "options": { @@ -427,7 +704,7 @@ "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 18 }, "id": 24, "options": { @@ -474,7 +751,7 @@ "h": 1, "w": 24, "x": 0, - "y": 18 + "y": 26 }, "id": 17, "panels": [], @@ -544,7 +821,7 @@ "h": 10, "w": 12, "x": 0, - "y": 19 + "y": 27 }, "id": 7, "options": { @@ -646,7 +923,7 @@ "h": 10, "w": 12, "x": 12, - "y": 19 + "y": 27 }, "id": 9, "options": { @@ -691,7 +968,7 @@ "h": 1, "w": 24, "x": 0, - "y": 29 + "y": 37 }, "id": 13, "panels": [], @@ -760,7 +1037,7 @@ "h": 8, "w": 12, "x": 0, - "y": 30 + "y": 38 }, "id": 3, "options": { @@ -862,7 +1139,7 @@ "h": 8, "w": 12, "x": 12, - "y": 30 + "y": 38 }, "id": 4, "options": { @@ -908,7 +1185,7 @@ "h": 1, "w": 24, "x": 0, - "y": 38 + "y": 46 }, "id": 
15, "panels": [], @@ -977,7 +1254,7 @@ "h": 9, "w": 12, "x": 0, - "y": 39 + "y": 47 }, "id": 6, "options": { @@ -1078,7 +1355,7 @@ "h": 9, "w": 12, "x": 12, - "y": 39 + "y": 47 }, "id": 8, "options": { @@ -1123,7 +1400,7 @@ "h": 1, "w": 24, "x": 0, - "y": 48 + "y": 56 }, "id": 32, "panels": [], @@ -1226,7 +1503,7 @@ "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 57 }, "id": 33, "options": { @@ -1242,7 +1519,7 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "9.5.6", + "pluginVersion": "9.5.8", "targets": [ { "datasource": { @@ -1331,7 +1608,7 @@ "h": 1, "w": 24, "x": 0, - "y": 57 + "y": 65 }, "id": 28, "panels": [ @@ -1373,7 +1650,7 @@ "h": 10, "w": 12, "x": 0, - "y": 31 + "y": 55 }, "id": 26, "options": { @@ -1470,7 +1747,7 @@ "type": "row" } ], - "refresh": "1m", + "refresh": "", "revision": 1, "schemaVersion": 38, "style": "dark", @@ -1512,14 +1789,14 @@ { "current": { "selected": false, - "text": "d71c7216-6409-45f8-be15-35cf57b8527c", - "value": "d71c7216-6409-45f8-be15-35cf57b8527c" + "text": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109", + "value": "9cd0dc62-fcc2-4b6b-abd3-6010a01a8109" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(dcgm_gpu_utilization, subscription)", + "definition": "label_values(up,subscription)", "hide": 0, "includeAll": false, "label": "Subscription", @@ -1527,10 +1804,10 @@ "name": "Subscription", "options": [], "query": { - "query": "label_values(dcgm_gpu_utilization, subscription)", - "refId": "StandardVariableQuery" + "query": "label_values(up,subscription)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1538,15 +1815,16 @@ }, { "current": { + "isNone": true, "selected": false, - "text": "ndv4-test-t", - "value": "ndv4-test-t" + "text": "None", + "value": "" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": 
"label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"}, cluster)", + "definition": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"},cluster)", "hide": 0, "includeAll": false, "label": "Cluster", @@ -1554,8 +1832,8 @@ "name": "Cluster", "options": [], "query": { - "query": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"}, cluster)", - "refId": "StandardVariableQuery" + "query": "label_values(dcgm_gpu_utilization{subscription=\"$Subscription\"},cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1565,15 +1843,16 @@ }, { "current": { + "isNone": true, "selected": false, - "text": "none", - "value": "none" + "text": "None", + "value": "" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"}, job_id)", + "definition": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"},job_id)", "hide": 0, "includeAll": false, "label": "Job Id", @@ -1581,8 +1860,8 @@ "name": "JobId", "options": [], "query": { - "query": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"}, job_id)", - "refId": "StandardVariableQuery" + "query": "label_values(dcgm_gpu_utilization{cluster=~\"$Cluster\"},job_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1611,7 +1890,6 @@ "timezone": "utc", "title": "Cluster Unified View (Experimental)", "uid": "e12394be-6c26-4c19-a089-f69930b17e7e", - "version": 62, + "version": 75, "weekStart": "" } - diff --git a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json index b311f74..3033e0f 100755 --- a/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/GPU_View.json @@ -753,8 +753,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ 
-856,8 +855,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -972,8 +970,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1074,8 +1071,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1830,8 +1826,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1846,7 +1841,7 @@ "h": 8, "w": 12, "x": 0, - "y": 52 + "y": 1 }, "id": 44, "options": { @@ -1922,8 +1917,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1938,7 +1932,7 @@ "h": 8, "w": 12, "x": 12, - "y": 52 + "y": 1 }, "id": 41, "options": { @@ -1974,7 +1968,7 @@ "type": "row" } ], - "refresh": "1m", + "refresh": "", "revision": 1, "schemaVersion": 38, "style": "dark", @@ -1993,7 +1987,7 @@ "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(dcgm_gpu_utilization, subscription)", + "definition": "label_values(dcgm_gpu_utilization,subscription)", "hide": 0, "includeAll": false, "label": "Subscription", @@ -2001,10 +1995,10 @@ "name": "Subscription", "options": [], "query": { - "query": "label_values(dcgm_gpu_utilization, subscription)", - "refId": "StandardVariableQuery" + "query": "label_values(dcgm_gpu_utilization,subscription)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2013,8 +2007,8 @@ { "current": { "selected": false, - "text": "yangwang1-integration-vmss", - "value": "yangwang1-integration-vmss" + "text": "none", + "value": "none" }, "datasource": { "type": "prometheus", @@ -2039,13 +2033,9 @@ }, { "current": { - "selected": true, - "text": [ - "none" - ], - "value": [ - "none" - ] + "selected": false, + "text": "none", + "value": "none" }, "datasource": { 
"type": "prometheus", @@ -2071,8 +2061,8 @@ { "current": { "selected": false, - "text": "yangwa0ae0000cn", - "value": "yangwa0ae0000cn" + "text": "none", + "value": "none" }, "datasource": { "type": "prometheus", @@ -2082,7 +2072,7 @@ "hide": 0, "includeAll": false, "label": "Instance", - "multi": true, + "multi": false, "name": "Instance", "options": [], "query": { @@ -2146,7 +2136,6 @@ "timezone": "utc", "title": "GPU View", "uid": "dHpbWBP4z", - "version": 41, + "version": 43, "weekStart": "" } - diff --git a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json index b52ebfb..d81d702 100755 --- a/deploy_managed_infra/grafana_dashboard_templates/Network_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Network_View.json @@ -1059,7 +1059,7 @@ "type": "row" } ], - "refresh": "1m", + "refresh": "", "revision": 1, "schemaVersion": 38, "style": "dark", @@ -1097,13 +1097,9 @@ }, { "current": { - "selected": true, - "text": [ - "yangwang1-integration-vmss" - ], - "value": [ - "yangwang1-integration-vmss" - ] + "selected": false, + "text": "none", + "value": "none" }, "datasource": { "type": "prometheus", @@ -1136,7 +1132,7 @@ "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(ib_port_physical_state{cluster=~\"$Cluster\"}, job_id)", + "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\"},job_id)", "hide": 0, "includeAll": false, "label": "Job Id", @@ -1144,8 +1140,8 @@ "name": "JobId", "options": [], "query": { - "query": "label_values(ib_port_physical_state{cluster=~\"$Cluster\"}, job_id)", - "refId": "StandardVariableQuery" + "query": "label_values(ib_port_physical_state{cluster=\"$Cluster\"},job_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1156,23 +1152,23 @@ { "current": { "selected": false, - "text": "yangwa0ae0000cn", - "value": "yangwa0ae0000cn" + "text": "none", + "value": 
"none" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(ib_port_physical_state{cluster=~\"$Cluster\", job_id=~\"$JobId\"}, instance)", + "definition": "label_values(ib_port_physical_state{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, "includeAll": false, "label": "Instance", - "multi": true, + "multi": false, "name": "Instance", "options": [], "query": { - "query": "label_values(ib_port_physical_state{cluster=~\"$Cluster\", job_id=~\"$JobId\"}, instance)", - "refId": "StandardVariableQuery" + "query": "label_values(ib_port_physical_state{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1231,7 +1227,6 @@ "timezone": "utc", "title": "Network View", "uid": "IziFPI8Vk", - "version": 11, + "version": 16, "weekStart": "" } - diff --git a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json index cf07077..a9f96a7 100755 --- a/deploy_managed_infra/grafana_dashboard_templates/Node_View.json +++ b/deploy_managed_infra/grafana_dashboard_templates/Node_View.json @@ -274,7 +274,7 @@ "type": "timeseries" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, @@ -282,448 +282,446 @@ "y": 9 }, "id": 4, - "panels": [], - "title": "Memory Counters", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "moneo-amw" - }, - "description": "Memory Utilization", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - 
"lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "description": "Memory Utilization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "single", + "sort": "none" } }, - "decimals": 2, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "id": 9, - "options": { - "legend": { - "calcs": [ - "max", - "last" + "editorMode": "code", + "exemplar": false, + "expr": 
"node_mem_util{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "title": "Memory Utilization", + "transparent": true, + "type": "timeseries" }, - "timezone": [ - "utc" - ], - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "editorMode": "code", - "exemplar": false, - "expr": "node_mem_util{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "range": true, - "refId": "A" - } - ], - "title": "Memory Utilization", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "moneo-amw" - }, - "description": "Memory Utilization", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "description": "Memory Utilization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "single", + "sort": "none" } }, - "decimals": 2, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "id": 12, - "options": { - "legend": { - "calcs": [ - "max", - "last" + "editorMode": "code", + "exemplar": false, + "expr": "node_mem_util{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "timezone": [ - "utc" - ], - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "moneo-amw" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_mem_util{subscription=~\"$Subscription\", 
cluster=~\"$Cluster\", instance=~\"$Instance\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "range": true, - "refId": "A" + "title": "Memory Utilization", + "transparent": true, + "type": "timeseries" } ], - "title": "Memory Utilization", - "transparent": true, - "type": "timeseries" + "title": "Memory Counters", + "type": "row" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 18 + "y": 10 }, "id": 6, - "panels": [], - "title": "Network Counters", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "moneo-amw" - }, - "description": "TX Rate of VM's Ethernet Interface", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "description": "TX Rate of VM's Ethernet Interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "id": 11, - "options": { - "legend": { - "calcs": [ - "max", - "last" + "editorMode": "code", + "exemplar": false, + "expr": "node_net_tx{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "title": "Ethernet TX Rate", + "transparent": true, + "type": "timeseries" }, - "timezone": [ - "utc" - ], - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "editorMode": "code", - "exemplar": false, - "expr": "node_net_tx{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "range": true, - "refId": "A" - } - ], - "title": "Ethernet TX Rate", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "type": 
"prometheus", - "uid": "moneo-amw" - }, - "description": "RX Rate of VM's Ethernet Interface", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "description": "RX Rate of VM's Ethernet Interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "moneo-amw" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "id": 13, - "options": { - "legend": { - "calcs": [ - "max", - "last" + "editorMode": "code", + "exemplar": false, + "expr": "node_net_rx{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "timezone": [ - "utc" - ], - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "moneo-amw" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_net_rx{subscription=~\"$Subscription\", cluster=~\"$Cluster\", instance=~\"$Instance\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "range": true, - "refId": "A" + "title": "Ethernet RX Rate", + "transparent": true, + "type": "timeseries" } ], - "title": "Ethernet RX Rate", - "transparent": true, - "type": "timeseries" + "title": "Network Counters", + "type": "row" }, { "collapsed": false, @@ -731,7 +729,7 @@ "h": 1, "w": 24, "x": 0, - "y": 27 + "y": 11 }, "id": 15, "panels": [], @@ -760,7 +758,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -775,10 +774,11 @@ "h": 8, "w": 12, "x": 0, - "y": 28 + "y": 12 }, "id": 17, "options": { + "cellHeight": "sm", "footer": { "countRows": false, "fields": "", @@ -789,7 +789,7 @@ }, "showHeader": true }, - "pluginVersion": "9.4.12", + "pluginVersion": "9.5.13", "targets": [ { "datasource": { @@ -868,7 +868,7 @@ "type": "table" } ], - "refresh": "1m", + "refresh": "", "revision": 1, "schemaVersion": 38, "style": "dark", 
@@ -887,7 +887,7 @@ "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(node_mem_util, subscription)", + "definition": "label_values(up,subscription)", "hide": 0, "includeAll": false, "label": "Subscription", @@ -895,10 +895,10 @@ "name": "Subscription", "options": [], "query": { - "query": "label_values(node_mem_util, subscription)", - "refId": "StandardVariableQuery" + "query": "label_values(up,subscription)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -907,14 +907,14 @@ { "current": { "selected": false, - "text": "yangwang1-integration-vmss", - "value": "yangwang1-integration-vmss" + "text": "none", + "value": "none" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(node_mem_util{subscription=\"$Subscription\"}, cluster)", + "definition": "label_values(up{subscription=\"$Subscription\"},cluster)", "hide": 0, "includeAll": false, "label": "Cluster", @@ -922,8 +922,8 @@ "name": "Cluster", "options": [], "query": { - "query": "label_values(node_mem_util{subscription=\"$Subscription\"}, cluster)", - "refId": "StandardVariableQuery" + "query": "label_values(up{subscription=\"$Subscription\"},cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -933,15 +933,16 @@ }, { "current": { + "isNone": true, "selected": false, - "text": "none", - "value": "none" + "text": "None", + "value": "" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(node_mem_util{cluster=\"$Cluster\"}, job_id)", + "definition": "label_values(up{cluster=\"$Cluster\"},job_id)", "description": "", "hide": 0, "includeAll": false, @@ -950,8 +951,8 @@ "name": "JobId", "options": [], "query": { - "query": "label_values(node_mem_util{cluster=\"$Cluster\"}, job_id)", - "refId": "StandardVariableQuery" + "query": "label_values(up{cluster=\"$Cluster\"},job_id)", 
+ "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -962,14 +963,14 @@ { "current": { "selected": false, - "text": "yangwa0ae0000cn", - "value": "yangwa0ae0000cn" + "text": "none", + "value": "none" }, "datasource": { "type": "prometheus", "uid": "moneo-amw" }, - "definition": "label_values(node_mem_util{cluster=\"$Cluster\", job_id=\"$JobId\"}, instance)", + "definition": "label_values(up{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", "hide": 0, "includeAll": false, "label": "Instance", @@ -977,8 +978,8 @@ "name": "Instance", "options": [], "query": { - "query": "label_values(node_mem_util{cluster=\"$Cluster\", job_id=\"$JobId\"}, instance)", - "refId": "StandardVariableQuery" + "query": "label_values(up{cluster=\"$Cluster\", job_id=\"$JobId\"},instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1005,7 +1006,7 @@ "options": [], "query": { "query": "label_values(node_cpu_util{instance=~\"$Instance\"},numa_domain)", - "refId": "StandardVariableQuery" + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", @@ -1033,7 +1034,6 @@ "timezone": "utc", "title": "Node View", "uid": "DBUc8IU4k", - "version": 16, + "version": 19, "weekStart": "" } - diff --git a/deploy_managed_infra/managed_infra_parameters.json b/deploy_managed_infra/managed_infra_parameters.json index 29654fc..866ac05 100644 --- a/deploy_managed_infra/managed_infra_parameters.json +++ b/deploy_managed_infra/managed_infra_parameters.json @@ -24,4 +24,4 @@ "value": null } } -} \ No newline at end of file +} diff --git a/deploy_managed_infra/managed_infra_template.json b/deploy_managed_infra/managed_infra_template.json index b512118..768c1b9 100644 --- a/deploy_managed_infra/managed_infra_template.json +++ b/deploy_managed_infra/managed_infra_template.json @@ -342,4 +342,4 @@ } } ] -} \ No newline at end of file +} diff --git a/docs/HeadlessDeployment.md b/docs/HeadlessDeployment.md 
index 43ff97a..1e25892 100644 --- a/docs/HeadlessDeployment.md +++ b/docs/HeadlessDeployment.md @@ -14,18 +14,7 @@ Follow steps outlined in [Infrastructure deployment](../deploy_managed_infra/REA ## Deploy Moneo ## -1. Modify the managed prometheus config file in `Moneo/src/worker/publisher/config/managed_prom_config.json`. - - Reference the user managed identity created during infrastructure deployment to get the "identity client id" - - Reference the Managed Prometheus resource created during infrastructure deployment to get the "metrics ingestion endpoint" - - The config file modifcations must be distributed to the Moneo directories on all workers. - ```json - { - "IDENTITY_CLIENT_ID": "", - "INGESTION_ENDPOINT": "" - } - ``` - -2. Assign the identity to your VMSS resource: +1. Assign the identity to your VMSS resource: - This can either be done via the portal or AZ CLI (below) - During VMSS creation: @@ -39,15 +28,35 @@ Follow steps outlined in [Infrastructure deployment](../deploy_managed_infra/REA az vmss identity assign -g -n --identities ``` -3. Start Services (Assumes Azure marketplace AI/HPC Image): ``` parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh true" ``` +2. You may choose to deploy Moneo services using [moneo service deploy script](../linux_service/moneo_service_deploy.sh). Otherwise, skip this step. + 1. Modify the following ENV variables with the appropriate data: + - IDENTITY_CLIENT_ID: This will be the client ID of the user managed identity + - INGESTION_ENDPOINT: This will be the URL to the ingestion endpoint + 2. Run the deploy script ```sudo ./moneo_service_deploy.sh```. This will install, configure, and start Moneo services. + 3. Skip to step 5. + Note: This step can be performed in parallel using pssh. Reference step 4 for start and stop commands. + +3. Modify the managed prometheus config file in `Moneo/src/worker/publisher/config/managed_prom_config.json`. 
+ - Reference the user managed identity created during infrastructure deployment to get the "identity client id" + - Reference the Managed Prometheus resource created during infrastructure deployment to get the "metrics ingestion endpoint" + - The config file modifications must be distributed to the Moneo directories on all workers. + + ```json + { + "IDENTITY_CLIENT_ID": "", + "INGESTION_ENDPOINT": "" + } + ``` + +4. Start Services (Assumes Azure marketplace AI/HPC Image): ``` parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh true" ``` - To stop services: ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/stop_moneo_services.sh"``` Note: If not using Azure AI/HPC market place image reference the ["Deploying Linux services guide"](../linux_service/README.md) for full instructions. -4. At this point data collection should be on going and metrics streaming to the Azure managed Grafana setup during infrastructure. +5. At this point data collection should be ongoing and metrics streaming to the Azure managed Grafana setup during infrastructure. Note: In the infrastructure deployment step you have the option to use provided template dashboards or create your own. -5. Check with Azure Grafana Dashboards to verify that the metrics are being ingested. +6. Check with Azure Grafana Dashboards to verify that the metrics are being ingested. ![image](assets/azuregrafana-managed_prometheus.png) diff --git a/linux_service/moneo_service_deploy.sh b/linux_service/moneo_service_deploy.sh new file mode 100755 index 0000000..65f6439 --- /dev/null +++ b/linux_service/moneo_service_deploy.sh @@ -0,0 +1,48 @@ +#!/bin/bash +######################################################## +# This script will configure, install, and launch Moneo services +# with Azure Managed Prometheus Remote Write. 
+# This script will install the specified release version in the +# specified directory below +######################################################## + +MONEO_VERSION=v0.3.4 # Release tag +MONITOR_DIR=/opt/azurehpc/tools # install directory +IDENTITY_CLIENT_ID="38b84eb5-8aec-4971-aaeb-ddd7e9bfef98" # This is the client ID of the Managed Identity for the Azure Prometheus Monitor Workspace +INGESTION_ENDPOINT="https://moneo-amw-q14z.southcentralus-1.metrics.ingest.monitor.azure.com/dataCollectionRules/dcr-c0192b4cd2c748f88ffd422e7a0d77ac/streams/Microsoft-PrometheusMetrics/api/v1/write?api-version=2023-04-24" # This is the ingestion endpoint for the Azure Prometheus Monitor Workspace +MONEO_PATH=$MONITOR_DIR/Moneo +PublisherMethod="" # This is the publisher method for Moneo. Options are azure_monitor, geneva (Msft internal Use), or leave blank for Azure Managed Prometheus + +# clone source to specified directory +if [[ -d "$MONEO_PATH" ]]; then + pushd $MONEO_PATH + git config --global --add safe.directory /opt/azurehpc/tools/Moneo + current_release=$(git describe --tags) + popd + if [[ "$current_release" != "$MONEO_VERSION" ]]; then + pushd $MONITOR_DIR + echo "Moneo Found but not at Release $MONEO_VERSION. Cloning Moneo $MONEO_VERSION." + rm -rf Moneo + git clone https://github.com/Azure/Moneo --branch $MONEO_VERSION + popd + fi +else + pushd $MONITOR_DIR + echo "Cloning Moneo." 
+ git clone https://github.com/Azure/Moneo --branch $MONEO_VERSION + popd +fi + +sudo chmod -R 777 $MONEO_PATH + +# Configure step +echo "{ + \"IDENTITY_CLIENT_ID\": \"$IDENTITY_CLIENT_ID\", + \"INGESTION_ENDPOINT\": \"$INGESTION_ENDPOINT\" }" > $MONEO_PATH/src/worker/publisher/config/managed_prom_config.json + +pushd $MONEO_PATH/linux_service + sudo ./configure_service.sh >> moneoServiceInstall.log + echo "Moneo install complete" + # Start Moneo services + sudo ./start_moneo_services.sh $PublisherMethod +popd diff --git a/linux_service/start_moneo_services.sh b/linux_service/start_moneo_services.sh index b5a35c3..fc12fd0 100755 --- a/linux_service/start_moneo_services.sh +++ b/linux_service/start_moneo_services.sh @@ -48,7 +48,7 @@ function proc_check(){ if [[ -n $WITH_MANAGED_PROM && $WITH_MANAGED_PROM = true ]]; then - if [[ $(docker ps -a | grep prometheus) ]] ; then + if [[ $(sudo docker ps -a | grep prometheus) ]] ; then echo "Prometheus docker containers running." else echo "Prometheus failed to start. Please ensure you have the proper user managed identity assigned to your VMSS/VM." 
@@ -61,24 +61,24 @@ function proc_check(){ $MONEO_PATH/linux_service/moneo_prestart.sh $MONEO_PATH 2> /dev/null -systemctl enable moneo@node_exporter.service -systemctl enable moneo@net_exporter.service -systemctl enable moneo@nvidia_exporter.service +sudo systemctl enable moneo@node_exporter.service +sudo systemctl enable moneo@net_exporter.service +sudo systemctl enable moneo@nvidia_exporter.service -systemctl start moneo@node_exporter.service -systemctl start moneo@net_exporter.service -systemctl start moneo@nvidia_exporter.service +sudo systemctl start moneo@node_exporter.service +sudo systemctl start moneo@net_exporter.service +sudo systemctl start moneo@nvidia_exporter.service if [[ -n $PublisherMethod ]]; then if [ "$PublisherMethod" == "geneva" ]; then - $MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config + sudo $MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config fi - sleep 5 # wait a bit for the exporters to start - systemctl enable moneo_publisher.service - systemctl start moneo_publisher.service + sleep 10 # wait a bit for the exporters to start + sudo systemctl enable moneo_publisher.service + sudo systemctl start moneo_publisher.service proc_check false else - $MONEO_PATH/src/worker/start_managed_prometheus.sh 2> /dev/null - sleep 5 + sudo $MONEO_PATH/src/worker/start_managed_prometheus.sh 2> /dev/null + sleep 10 # wait a bit for the exporters to start proc_check true fi diff --git a/linux_service/stop_moneo_services.sh b/linux_service/stop_moneo_services.sh index c56ef8b..be1e49c 100755 --- a/linux_service/stop_moneo_services.sh +++ b/linux_service/stop_moneo_services.sh @@ -1,21 +1,22 @@ #!/bin/bash +echo "Stopping Exporters Services" -systemctl stop moneo@node_exporter.service -systemctl stop moneo@net_exporter.service -systemctl stop moneo@nvidia_exporter.service -systemctl stop moneo_publisher.service +sudo systemctl stop moneo@node_exporter.service 2> /dev/null +sudo 
systemctl stop moneo@net_exporter.service 2> /dev/null +sudo systemctl stop moneo@nvidia_exporter.service 2> /dev/null +sudo systemctl stop moneo_publisher.service 2> /dev/null -systemctl disable moneo@node_exporter.service -systemctl disable moneo@net_exporter.service -systemctl disable moneo@nvidia_exporter.service -systemctl disable moneo_publisher.service +sudo systemctl disable moneo@node_exporter.service 2> /dev/null +sudo systemctl disable moneo@net_exporter.service 2> /dev/null +sudo systemctl disable moneo@nvidia_exporter.service 2> /dev/null +sudo systemctl disable moneo_publisher.service 2> /dev/null -if [[ $(docker ps -a | grep prometheus) ]]; then +if [[ $(sudo docker ps -a | grep prometheus) ]]; then echo "Stopping Prometheus containers" - docker stop prometheus genevamdmagent - docker rm prometheus genevamdmagent -elif [[ $(docker ps -a | grep genevamdmagent) ]]; then - docker stop genevamdmagent - docker rm genevamdmagent - + sudo docker stop prometheus genevamdmagent 2> /dev/null + sudo docker rm prometheus genevamdmagent 2> /dev/null +elif [[ $(sudo docker ps -a | grep genevamdmagent) ]]; then + echo "Stopping Geneva containers" + sudo docker stop genevamdmagent 2> /dev/null + sudo docker rm genevamdmagent 2> /dev/null fi diff --git a/moneo.py b/moneo.py index 0045690..6681acf 100644 --- a/moneo.py +++ b/moneo.py @@ -179,9 +179,8 @@ def deploy_worker(self, hosts_file, max_threads=16): # noqa: C901 else: cmd = cmd + ' false' cmd = cmd + " \"\"" - # gpu sample rate - cmd = cmd + " " + str(args.gpu_sample_rate) - print(cmd) + # gpu sample rate + ethernet device + cmd = cmd + " " + str(args.gpu_sample_rate) + " " + args.ethernet_device if self.args.custom_metrics_file_path: print('-Custom exporter enabled-') logging.info('Custom exporter enabled') @@ -408,8 +407,13 @@ def parallel_ssh_check(): parser.add_argument( '--gpu_sample_rate', type=int, - choices=[1, 2, 3, 10], - help='Number of samples per minute for GPU monitoring. 
Valid options are 1,2,3,10', default=2) + choices=[ 1, 2, 30, 60, 120, 600], + help='Number of samples per minute for GPU monitoring. Valid options are 1,2,3,10', default=60) + parser.add_argument( + '--ethernet_device', + type=str, + default='eth0', + help='The name of the ethernet device to use for network monitoring. Default is eth0') args = parser.parse_args() diff --git a/src/worker/exporters/node_exporter.py b/src/worker/exporters/node_exporter.py index ff9716b..0eb2509 100644 --- a/src/worker/exporters/node_exporter.py +++ b/src/worker/exporters/node_exporter.py @@ -52,6 +52,7 @@ def shell_cmd(cmd, timeout): except subprocess.TimeoutExpired: child.kill() print("Command " + " ".join(args) + ", Failed on timeout") + logging.error("Command " + " ".join(args) + ", Failed on timeout") result = 'TimeOut' return result return result.decode() @@ -163,7 +164,7 @@ def collect(self, field_name): # noqa: C901 value[hca] = [] value[hca].append(timestamp) except Exception as e: - logging.error('Raised exception. Message: %s', e) + logging.exception('Exception occured during collection. Message: %s', e) pass else: value = 0 @@ -203,7 +204,7 @@ def handle_field(self, field_name, value): # noqa: C901 self.config['counter'][field_name][dev_id].append(time_stamp) self.config['sample_timestamp'][field_name] = event_time except Exception as e: - logging.error('Raised exception. Message: %s', e) + logging.exception('Raised exception during xid/linkflap handling. 
Message: %s', e) pass else: self.update_field(field_name, value, self.config['job_id']) @@ -277,7 +278,7 @@ def get_core_numa_mapping(core_count): # you will need to initialize your custom metric's file if we are exporting # from a file you may also want to initialize the config's counter member # for the specific field -def init_config(job_id, port=None): +def init_config(job_id, port=None, ethernet_device='eth0'): '''Example of config initialization''' global config if not port: @@ -290,7 +291,8 @@ def init_config(job_id, port=None): 'job_id': job_id, 'fieldFiles': {}, 'counter': {}, - 'sample_timestamp': {} + 'sample_timestamp': {}, + 'ethernet_device': ethernet_device } # for xid and link flaps config['command'] = {} @@ -385,7 +387,7 @@ def init_ib_config(): IB_Mapping[mapping] = ib.strip() + ':1' FIELD_LIST.append('link_flap') except Exception as e: - print(e) + logging.exception('Exception occured during configuration. Message: %s', e) pass @@ -411,7 +413,7 @@ def init_nvidia_config(): config['counter']['xid_error'][pci] = [] FIELD_LIST.append('xid_error') except Exception as e: - print(e) + logging.exception('Exception occured during configuration. Message: %s', e) pass @@ -435,6 +437,12 @@ def main(): type=int, default=None, help='Port to export metrics from') + parser.add_argument( + "-e", + "--ethernet_device", + type=str, + default='eth0', + help='Ethernet device to monitor') args = parser.parse_args() # set up logging os.makedirs('/tmp/moneo-worker', exist_ok=True) @@ -442,12 +450,12 @@ def main(): format='[%(asctime)s] node_exporter-%(levelname)s-%(message)s') jobId = None # set a default job id of None try: - init_config(jobId, args.port) + init_config(jobId, args.port, args.ethernet_device) init_signal_handler() exporter = NodeExporter(FIELD_LIST, config) exporter.loop() except Exception as e: - logging.error('Raised exception. Message: %s', e) + logging.exception('Exception occured during configuration. 
Message: %s', e) if __name__ == '__main__': diff --git a/src/worker/exporters/nvidia_exporter.py b/src/worker/exporters/nvidia_exporter.py index c90901f..3246228 100644 --- a/src/worker/exporters/nvidia_exporter.py +++ b/src/worker/exporters/nvidia_exporter.py @@ -134,14 +134,14 @@ def __init__(self): self, fieldIds=dcgm_config['publishFieldIds'], ignoreList=dcgm_config['ignoreList'], - updateFrequency=int(1000000 / dcgm_config['prometheusPublishInterval']), + updateFrequency=int(60000000 / dcgm_config['prometheusPublishInterval']), maxKeepAge=1800.0, fieldGroupName='dcgm_exporter_{}'.format(os.getpid()), hostname=dcgm_config['dcgmHostName'], ) logging.info( - 'DCGM sample interval: {} microseconds' - .format(int(1000000 / dcgm_config['prometheusPublishInterval']))) + 'DCGM sample interval: {} per minute' + .format(dcgm_config['prometheusPublishInterval'])) self.InitConnection() self.InitGauges() signal.signal(signal.SIGUSR1, self.jobID_update_flag) @@ -268,7 +268,7 @@ def Loop(self): if (job_update): self.jobID_update() self.Process() - time.sleep(0.1) + time.sleep(60 / dcgm_config['prometheusPublishInterval']) if dcgm_config['exit']: logging.info('Received exit signal, shutting down ...') break @@ -314,9 +314,9 @@ def parse_dcgm_cli(): '-s', '--sample_per_min', type=int, - default=2, - choices=[1, 2, 3, 10], - help='Samples per minute. Default 2') + default=60, + choices=[ 1, 2, 30, 60, 120, 600], + help='Samples per minute. 
Default 60') args = dcgm_client_cli_parser.run_parser(parser) # add profiling metrics if flag enabled if (args.profiler_metrics): diff --git a/src/worker/install/azure_monitor.sh b/src/worker/install/azure_monitor.sh index 7238add..99a1f70 100755 --- a/src/worker/install/azure_monitor.sh +++ b/src/worker/install/azure_monitor.sh @@ -1,4 +1,4 @@ #!/bin/bash # install opentelemetry and Azure monitor -python3 -m pip install opentelemetry-sdk==1.15.0 azure-monitor-opentelemetry --pre +python3 -m pip -qqq install opentelemetry-sdk==1.15.0 azure-monitor-opentelemetry --pre diff --git a/src/worker/install/common.sh b/src/worker/install/common.sh index 6a18abc..242c74b 100644 --- a/src/worker/install/common.sh +++ b/src/worker/install/common.sh @@ -14,4 +14,4 @@ else fi command -v pip3 >/dev/null 2>&1 || python3 <(curl -s https://bootstrap.pypa.io/get-pip.py) -python3 -m pip install prometheus_client psutil filelock +python3 -m pip -qqq install prometheus_client psutil filelock diff --git a/src/worker/install/install.sh b/src/worker/install/install.sh index 785ed8f..f33ef9a 100755 --- a/src/worker/install/install.sh +++ b/src/worker/install/install.sh @@ -14,7 +14,7 @@ else fi # uninstall to deal with Azure monitor and Geneva differences -python3 -m pip uninstall \ +python3 -m pip -qqq uninstall \ azure-monitor-opentelemetry-exporter \ opentelemetry-instrumentation \ opentelemetry-api \ @@ -35,7 +35,7 @@ then if [ $PUBLISHER_INSTALL == 'geneva' ]; then # Install open telemetry related packages - python3 -m pip install opentelemetry-sdk opentelemetry-exporter-otlp + python3 -m pip -qqq install opentelemetry-sdk opentelemetry-exporter-otlp # Pull Geneva Metrics Extension(MA) docker image docker pull linuxgeneva-microsoft.azurecr.io/genevamdm:$MDM_DOCKER_VERSION diff --git a/src/worker/start.sh b/src/worker/start.sh index d427f01..555781d 100755 --- a/src/worker/start.sh +++ b/src/worker/start.sh @@ -10,7 +10,9 @@ PUBLISHER_AUTH=${3:-""} GPU_SAMPLE_RATE=$4 
-CUTSOM_METRICS_PATH=${5:-""} +ETH_DEV=${5:-""} + +CUTSOM_METRICS_PATH=${6:-""} #shutdown previous instances $WORK_DIR/shutdown.sh false @@ -36,7 +38,7 @@ then fi nohup python3 $WORK_DIR/exporters/net_exporter.py /dev/null 2>&1 & -nohup python3 $WORK_DIR/exporters/node_exporter.py /dev/null 2>&1 & +nohup python3 $WORK_DIR/exporters/node_exporter.py -e $ETH_DEV /dev/null 2>&1 & if [[ -n "$CUTSOM_METRICS_PATH" ]] then nohup python3 $WORK_DIR/exporters/custom_exporter.py --custom_metrics_file_path $CUTSOM_METRICS_PATH /dev/null 2>&1 &