diff --git a/cmd/collectors/rest/plugins/health/health.go b/cmd/collectors/rest/plugins/health/health.go
index 3aaedce27..df057d065 100644
--- a/cmd/collectors/rest/plugins/health/health.go
+++ b/cmd/collectors/rest/plugins/health/health.go
@@ -1,6 +1,7 @@
 package health
 
 import (
+    "errors"
     "fmt"
     "github.com/netapp/harvest/v2/cmd/collectors"
     "github.com/netapp/harvest/v2/cmd/poller/plugin"
@@ -14,6 +15,7 @@ import (
     "github.com/tidwall/gjson"
     "log/slog"
     "strconv"
+    "strings"
     "time"
 )
 
@@ -33,6 +35,7 @@ const (
     volumeRansomwareHealthMatrix = "health_volume_ransomware"
     volumeMoveHealthMatrix       = "health_volume_move"
     licenseHealthMatrix          = "health_license"
+    emsHealthMatrix              = "health_ems"
     severityLabel                = "severity"
     defaultDataPollDuration      = 3 * time.Minute
 )
@@ -44,6 +47,7 @@ type Health struct {
     lastFilterTime int64
     previousData   map[string]*matrix.Matrix
     resolutionData map[string]*matrix.Matrix
+    emsSeverity    []string
 }
 
 func New(p *plugin.AbstractPlugin) plugin.Plugin {
@@ -66,6 +70,20 @@ func (h *Health) Init() error {
         return err
     }
 
+    ems := h.Params.GetChildS("ems")
+
+    // Set default severity to "emergency"
+    h.emsSeverity = []string{"emergency"}
+    if ems != nil {
+        severity := ems.GetChildS("severity")
+        if severity != nil {
+            severities := severity.GetAllChildContentS()
+            if len(severities) > 0 {
+                h.emsSeverity = severities
+            }
+        }
+    }
+
     timeout, _ := time.ParseDuration(rest.DefaultTimeout)
     if h.client, err = rest.New(conf.ZapiPoller(h.ParentParams), timeout, h.Auth); err != nil {
         return err
@@ -147,6 +165,14 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util.Metadata, error) {
         h.resolutionData[k].SetGlobalLabels(data.GetGlobalLabels())
     }
 
+    // Initialize emsMatrix separately as it doesn't need to be stored or processed for resolution
+    emsMat := matrix.New(h.Parent+emsHealthMatrix, emsHealthMatrix, emsHealthMatrix)
+    emsMat.SetGlobalLabels(data.GetGlobalLabels())
+    if err := h.initMatrix(emsHealthMatrix, "", map[string]*matrix.Matrix{emsHealthMatrix: emsMat}); err != nil {
+        h.SLogger.Warn("error while initializing emsHealthMatrix", slogx.Err(err))
+        return nil, nil, err
+    }
+
     diskAlertCount := h.collectDiskAlerts()
     shelfAlertCount := h.collectShelfAlerts()
     supportAlertCount := h.collectSupportAlerts()
@@ -158,6 +184,7 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util.Metadata, error) {
     volumeRansomwareAlertCount := h.collectVolumeRansomwareAlerts()
     volumeMoveAlertCount := h.collectVolumeMoveAlerts()
     licenseAlertCount := h.collectLicenseAlerts()
+    emsAlertCount := h.collectEmsAlerts(emsMat)
 
     resolutionInstancesCount := h.generateResolutionMetrics()
 
@@ -170,6 +197,8 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util.Metadata, error) {
     for _, value := range h.resolutionData {
         result = append(result, value)
     }
+
+    result = append(result, emsMat)
     h.SLogger.Info(
         "Collected",
         slog.Int("numLicenseAlerts", licenseAlertCount),
@@ -183,12 +212,13 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util.Metadata, error) {
         slog.Int("numSupportAlerts", supportAlertCount),
         slog.Int("numShelfAlerts", shelfAlertCount),
         slog.Int("numDiskAlerts", diskAlertCount),
+        slog.Int("numEmsAlerts", emsAlertCount),
         slog.Int("numResolutionInstanceCount", resolutionInstancesCount),
     )
 
     //nolint:gosec
     h.client.Metadata.PluginInstances = uint64(diskAlertCount + shelfAlertCount + supportAlertCount +
         nodeAlertCount + HAAlertCount + networkEthernetPortAlertCount + networkFcpPortAlertCount +
-        networkInterfaceAlertCount + volumeRansomwareAlertCount + volumeMoveAlertCount + licenseAlertCount + resolutionInstancesCount)
+        networkInterfaceAlertCount + volumeRansomwareAlertCount + volumeMoveAlertCount + licenseAlertCount + emsAlertCount + resolutionInstancesCount)
 
     return result, h.client.Metadata, nil
 }
@@ -635,6 +665,51 @@ func (h *Health) collectDiskAlerts() int {
     return diskAlertCount
 }
 
+func (h *Health) collectEmsAlerts(emsMat *matrix.Matrix) int {
+    var (
+        instance *matrix.Instance
+    )
+    emsAlertCount := 0
+    records, err := h.getEmsAlerts()
+    if err != nil {
+        if errs.IsRestErr(err, errs.APINotFound) {
+            h.SLogger.Debug("API not found", slogx.Err(err))
+        } else {
+            h.SLogger.Error("Failed to collect ems data", slogx.Err(err))
+        }
+        return 0
+    }
+    for _, record := range records {
+        node := record.Get("node.name").String()
+        severity := record.Get("message.severity").String()
+        message := record.Get("message.name").String()
+        source := record.Get("source").String()
+        if instance = emsMat.GetInstance(message); instance == nil {
+            instance, err = emsMat.NewInstance(message)
+            if err != nil {
+                h.SLogger.Warn("error while creating instance", slog.String("key", message))
+                continue
+            }
+            instance.SetLabel("node", node)
+            instance.SetLabel("message", message)
+            instance.SetLabel("source", source)
+            instance.SetLabel(severityLabel, severity)
+            h.setAlertMetric(emsMat, instance, 1)
+            emsAlertCount++
+        } else {
+            // Increment the alert metric count by 1
+            currentCount, err := h.getAlertMetric(emsMat, instance)
+            if err != nil {
+                h.SLogger.Error("Failed to get alert metric", slogx.Err(err))
+                continue
+            }
+            h.setAlertMetric(emsMat, instance, currentCount+1)
+        }
+    }
+
+    return emsAlertCount
+}
+
 func (h *Health) getDisks() ([]gjson.Result, error) {
     fields := []string{"name", "container_type"}
     query := "api/storage/disks"
@@ -761,6 +836,26 @@ func (h *Health) getEthernetPorts() ([]gjson.Result, error) {
     return collectors.InvokeRestCall(h.client, href, h.SLogger)
 }
 
+func (h *Health) getEmsAlerts() ([]gjson.Result, error) {
+    clusterTime, err := collectors.GetClusterTime(h.client, nil, h.SLogger)
+    if err != nil {
+        return nil, err
+    }
+    fromTime := clusterTime.Add(-24 * time.Hour).Unix()
+    timeFilter := fmt.Sprintf("time=>=%d", fromTime)
+    severityFilter := "message.severity=" + strings.Join(h.emsSeverity, "|")
+    fields := []string{"node,message,source"}
+    query := "api/support/ems/events"
+    href := rest.NewHrefBuilder().
+        APIPath(query).
+        Fields(fields).
+        MaxRecords(collectors.DefaultBatchSize).
+        Filter([]string{timeFilter, severityFilter}).
+        Build()
+
+    return collectors.InvokeRestCall(h.client, href, h.SLogger)
+}
+
 func (h *Health) getSupportAlerts(filter []string) ([]gjson.Result, error) {
     query := "api/private/support/alerts"
     href := rest.NewHrefBuilder().
@@ -813,6 +908,15 @@ func (h *Health) setAlertMetric(mat *matrix.Matrix, instance *matrix.Instance, value float64) {
     }
 }
 
+func (h *Health) getAlertMetric(mat *matrix.Matrix, instance *matrix.Instance) (float64, error) {
+    m := mat.GetMetric("alerts")
+    if m != nil {
+        v, _ := m.GetValueFloat64(instance)
+        return v, nil
+    }
+    return 0, errors.New("alert metric doesn't exist")
+}
+
 func (h *Health) generateResolutionMetrics() int {
     resolutionInstancesCount := 0
     for prevKey, prevMat := range h.previousData {
diff --git a/cmd/tools/generate/counter.yaml b/cmd/tools/generate/counter.yaml
index 1576d0498..e66326695 100644
--- a/cmd/tools/generate/counter.yaml
+++ b/cmd/tools/generate/counter.yaml
@@ -1328,6 +1328,15 @@ counters:
         ONTAPCounter: Harvest generated
         Template: conf/rest/9.6.0/health.yaml
 
+  - Name: health_ems_alerts
+    Description: The health_ems_alerts metric monitors EMS (Event Management System) events, providing a count based on their severity and other attributes. This metric includes labels such as node, message, source, and severity (e.g., emergency, alert, error).
+      By default, it monitors alerts with emergency severity.
+    APIs:
+      - API: REST
+        Endpoint: NA
+        ONTAPCounter: Harvest generated
+        Template: conf/rest/9.6.0/health.yaml
+
   - Name: qos_policy_adaptive_absolute_min_iops
     Description: Specifies the absolute minimum IOPS that is used as an override when the expected_iops is less than this value.
     APIs:
diff --git a/conf/rest/9.6.0/health.yaml b/conf/rest/9.6.0/health.yaml
index 41c0fe245..25c3ac3a6 100644
--- a/conf/rest/9.6.0/health.yaml
+++ b/conf/rest/9.6.0/health.yaml
@@ -7,6 +7,14 @@ counters:
   - ^name
 
 plugins:
-  - Health
+  - Health:
+      # Description:
+      # This configuration enables the Health plugin to monitor EMS alerts with the specified severities.
+      # - severity: A list of severities to monitor. Possible values are emergency, alert, error, notice, informational, debug.
+      ems:
+        severity:
+          - emergency
+#          - alert
+#          - error
 
 export_data: false
diff --git a/docs/ontap-metrics.md b/docs/ontap-metrics.md
index 312773df8..205ac2a1a 100644
--- a/docs/ontap-metrics.md
+++ b/docs/ontap-metrics.md
@@ -3288,6 +3288,15 @@ Provides any issues related to Disks health check if disks are broken or unassigned. Value of 1 means issue is happening and 0 means that issue is resolved.
 | REST | `NA` | `Harvest generated` | conf/rest/9.6.0/health.yaml |
 
 
+### health_ems_alerts
+
+The health_ems_alerts metric monitors EMS (Event Management System) events, providing a count based on their severity and other attributes. This metric includes labels such as node, message, source, and severity (e.g., emergency, alert, error). By default, it monitors alerts with emergency severity.
+
+| API | Endpoint | Metric | Template |
+|--------|----------|--------|---------|
+| REST | `NA` | `Harvest generated` | conf/rest/9.6.0/health.yaml |
+
+
 ### health_ha_alerts
 
 Provides any issues related to HA health check. Value of 1 means issue is happening and 0 means that issue is resolved.
diff --git a/grafana/dashboards/cmode/health.json b/grafana/dashboards/cmode/health.json
index c5b8f8ddf..e6466a135 100644
--- a/grafana/dashboards/cmode/health.json
+++ b/grafana/dashboards/cmode/health.json
@@ -71,7 +71,7 @@
   "gnetId": null,
   "graphTooltip": 1,
   "id": null,
-  "iteration": 1718007936255,
+  "iteration": 1730271272394,
   "links": [
     {
       "asDropdown": true,
@@ -106,14 +106,14 @@
     {
       "datasource": "${DS_PROMETHEUS}",
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 24,
         "x": 0,
         "y": 1
       },
       "id": 241,
       "options": {
-        "content": "This dashboard requires ONTAP 9.6+ and the REST collector. Two actions are required to use this dashboard:\n1. Enable the REST collector in your harvest.yml config\n2. Enable the EMS collector in your harvest.yml config for EMS events\n\nHarvest only detects Health and EMS events that are generated after the two collectors are enabled. By default, Harvest collects a focused set of [EMS events](https://github.com/NetApp/harvest/blob/main/conf/ems/9.6.0/ems.yaml) and this dashboard only shows the emergency level events.",
+        "content": "This dashboard is compatible with ONTAP 9.6+ and requires the REST collector. To use this dashboard, please follow the step below:\n\n1. **Enable the REST Collector**: Update your `harvest.yml` configuration file to enable the REST collector.",
         "mode": "markdown"
       },
       "pluginVersion": "8.1.8",
@@ -126,7 +126,7 @@
       "h": 1,
       "w": 24,
       "x": 0,
-      "y": 6
+      "y": 4
     },
     "id": 225,
     "panels": [],
@@ -191,7 +191,7 @@
       "h": 6,
       "w": 8,
       "x": 0,
-      "y": 7
+      "y": 5
     },
     "id": 277,
     "options": {
@@ -282,7 +282,7 @@
       "h": 6,
       "w": 8,
       "x": 8,
-      "y": 7
+      "y": 5
     },
     "id": 278,
     "options": {
@@ -358,7 +358,7 @@
       "h": 6,
       "w": 8,
       "x": 16,
-      "y": 7
+      "y": 5
     },
     "id": 270,
     "options": {
@@ -380,14 +380,14 @@
     "targets": [
       {
         "exemplar": false,
-        "expr": "(count(last_over_time(ems_events{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",severity=\"emergency\"}[$__range]) == 1) or vector(0))",
+        "expr": "sum(health_ems_alerts{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", severity=\"emergency\"}) or vector(0)",
         "instant": true,
         "interval": "",
         "legendFormat": "",
         "refId": "A"
       }
     ],
-    "title": "Total Active Emergency EMS",
+    "title": "Active Emergency EMS Alerts (Last 24 Hours)",
     "transformations": [],
     "type": "stat"
   },
@@ -416,7 +416,7 @@
       "h": 16,
       "w": 8,
       "x": 0,
-      "y": 13
+      "y": 11
     },
     "id": 268,
     "options": {
@@ -567,7 +567,7 @@
       "h": 16,
       "w": 8,
       "x": 8,
-      "y": 13
+      "y": 11
     },
     "id": 269,
     "options": {
@@ -721,17 +721,23 @@
       "h": 16,
       "w": 8,
       "x": 16,
-      "y": 13
+      "y": 11
     },
     "id": 272,
     "options": {
-      "showHeader": true
+      "showHeader": true,
+      "sortBy": [
+        {
+          "desc": false,
+          "displayName": "EMS"
+        }
+      ]
     },
     "pluginVersion": "8.1.8",
     "targets": [
       {
         "exemplar": false,
-        "expr": "count(last_over_time(ems_events{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",severity=\"emergency\"}[$__range]) == 1) by (message)",
+        "expr": "sum(health_ems_alerts{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", severity=\"emergency\"}) by (message)",
         "format": "table",
         "instant": true,
         "interval": "",
@@ -739,7 +745,7 @@
         "refId": "A"
       }
     ],
-    "title": "Active Emergency EMS",
+    "title": "Active Emergency EMS Alerts (Last 24 Hours)",
     "transformations": [
       {
         "id": "organize",
@@ -759,14 +765,14 @@
   },
   {
     "collapsed": true,
-    "datasource": "${DS_PROMETHEUS}",
+    "datasource": null,
     "gridPos": {
       "h": 1,
       "w": 24,
       "x": 0,
-      "y": 29
+      "y": 27
    },
-    "id": 251,
+    "id": 283,
    "panels": [
      {
        "datasource": "${DS_PROMETHEUS}",
@@ -815,6 +821,28 @@
              }
            ]
          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Takeover Possible"
+            },
+            "properties": [
+              {
+                "id": "mappings",
+                "value": [
+                  {
+                    "options": {
+                      "false": {
+                        "index": 0,
+                        "text": "No"
+                      }
+                    },
+                    "type": "value"
+                  }
+                ]
+              }
+            ]
+          },
          {
            "matcher": {
              "id": "byName",
@@ -880,6 +908,28 @@
              ]
            }
          ]
+        },
+        {
+          "matcher": {
+            "id": "byName",
+            "options": "partner"
+          },
+          "properties": [
+            {
+              "id": "displayName",
+              "value": "Partner Node"
+            },
+            {
+              "id": "links",
+              "value": [
+                {
+                  "targetBlank": true,
+                  "title": "",
+                  "url": "/d/cdot-node/ontap-node?orgId=1&${Datacenter:queryparam}&${Cluster:queryparam}&${__url_time_range}&var-Node=${__value.raw}"
+                }
+              ]
+            }
+          ]
        }
      ]
    },
@@ -887,9 +937,9 @@
      "h": 6,
      "w": 24,
"x": 0, - "y": 28 + "y": 29 }, - "id": 253, + "id": 281, "options": { "showHeader": true }, @@ -897,7 +947,7 @@ "targets": [ { "exemplar": false, - "expr": "node_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} * on(cluster,node,datacenter) group_left(severity) (health_node_alerts{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} == 1)", + "expr": "node_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} * on(cluster,node,datacenter) group_left(severity,takeover_possible,partner,partner_state,state_description) (health_ha_alerts{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} == 1)", "format": "table", "instant": true, "interval": "", @@ -905,7 +955,7 @@ "refId": "A" } ], - "title": "Node Issues", + "title": "HA Issues", "transformations": [ { "id": "organize", @@ -913,26 +963,47 @@ "excludeByName": { "Time": true, "Value": true, + "cpu_firmware_release": true, "instance": true, - "job": true + "job": true, + "max_aggr_size": true, + "max_vol_num": true, + "max_vol_size": true }, "indexByName": { "Time": 0, - "Value": 10, + "Value": 14, "cluster": 2, + "cpu_firmware_release": 15, "datacenter": 1, - "healthy": 4, - "instance": 8, - "job": 9, + "healthy": 5, + "instance": 12, + "job": 13, + "location": 16, + "max_aggr_size": 17, + "max_vol_num": 18, + "max_vol_size": 19, + "model": 20, "node": 3, - "severity": 6, - "state": 5, - "version": 7 + "partner": 7, + "partner_state": 8, + "serial": 21, + "severity": 10, + "state": 6, + "state_description": 9, + "takeover_possible": 4, + "vendor": 22, + "version": 11, + "warnings": 23 }, "renameByName": { + "cpu_firmware_release": "", "healthy": "Healthy", + "partner_state": "Partner State", "severity": "Severity", "state": "State", + "state_description": "State Description", + "takeover_possible": "Takeover Possible", "version": "Version" } } @@ -941,19 +1012,19 @@ "type": "table" } ], - "title": "Node", + "title": "HA", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": "${DS_PROMETHEUS}", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }, - "id": 283, + "id": 251, "panels": [ { "datasource": "${DS_PROMETHEUS}", @@ -1002,28 +1073,6 @@ } ] }, - { - "matcher": { - "id": "byName", - "options": "Takeover Possible" - }, - "properties": [ - { - "id": "mappings", - "value": [ - { - "options": { - "false": { - "index": 0, - "text": "No" - } - }, - "type": "value" - } - ] - } - ] - }, { "matcher": { "id": "byName", @@ -1089,28 +1138,6 @@ ] } ] - }, - { - "matcher": { - "id": "byName", - "options": "partner" - }, - "properties": [ - { - "id": "displayName", - "value": "Partner Node" - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "/d/cdot-node/ontap-node?orgId=1&${Datacenter:queryparam}&${Cluster:queryparam}&${__url_time_range}&var-Node=${__value.raw}" - } - ] - } - ] } ] }, @@ -1118,9 +1145,9 @@ "h": 6, "w": 24, "x": 0, - "y": 29 + "y": 28 }, - "id": 281, + "id": 253, "options": { "showHeader": true }, @@ -1128,7 +1155,7 @@ "targets": [ { "exemplar": false, - "expr": "node_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} * on(cluster,node,datacenter) group_left(severity,takeover_possible,partner,partner_state,state_description) (health_ha_alerts{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} == 1)", + "expr": "node_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} * on(cluster,node,datacenter) group_left(severity) (health_node_alerts{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} == 1)", "format": "table", "instant": true, "interval": "", @@ -1136,7 
+1163,7 @@ "refId": "A" } ], - "title": "HA Issues", + "title": "Node Issues", "transformations": [ { "id": "organize", @@ -1144,47 +1171,26 @@ "excludeByName": { "Time": true, "Value": true, - "cpu_firmware_release": true, "instance": true, - "job": true, - "max_aggr_size": true, - "max_vol_num": true, - "max_vol_size": true + "job": true }, "indexByName": { "Time": 0, - "Value": 14, + "Value": 10, "cluster": 2, - "cpu_firmware_release": 15, "datacenter": 1, - "healthy": 5, - "instance": 12, - "job": 13, - "location": 16, - "max_aggr_size": 17, - "max_vol_num": 18, - "max_vol_size": 19, - "model": 20, + "healthy": 4, + "instance": 8, + "job": 9, "node": 3, - "partner": 7, - "partner_state": 8, - "serial": 21, - "severity": 10, - "state": 6, - "state_description": 9, - "takeover_possible": 4, - "vendor": 22, - "version": 11, - "warnings": 23 + "severity": 6, + "state": 5, + "version": 7 }, "renameByName": { - "cpu_firmware_release": "", "healthy": "Healthy", - "partner_state": "Partner State", "severity": "Severity", "state": "State", - "state_description": "State Description", - "takeover_possible": "Takeover Possible", "version": "Version" } } @@ -1193,7 +1199,7 @@ "type": "table" } ], - "title": "HA", + "title": "Node", "type": "row" }, { @@ -4137,7 +4143,7 @@ "h": 8, "w": 24, "x": 0, - "y": 35 + "y": 37 }, "id": 237, "options": { @@ -4147,7 +4153,7 @@ "targets": [ { "exemplar": false, - "expr": "last_over_time(ems_events{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",severity=\"emergency\"}[$__range]) == 1", + "expr": "sum(health_ems_alerts{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", severity=\"emergency\"}) by (datacenter,cluster,node,message,severity)", "format": "table", "instant": true, "interval": "", @@ -4155,13 +4161,20 @@ "refId": "A" } ], - "title": "Emergency EMS", + "title": "Active Emergency EMS Alerts (Last 24 Hours)", "transformations": [ { "id": "filterFieldsByName", "options": { "include": { - "pattern": "/^(datacenter|cluster|message|node|severity)$/" + "names": [ + "cluster", + "datacenter", + "message", + "node", + "severity", + "Value" + ] } } }, @@ -4177,6 +4190,7 @@ "severity": 4 }, "renameByName": { + "Value": "Count", "message": "Message", "severity": "Severity" } @@ -4473,11 +4487,11 @@ "options": [ { "selected": true, - "text": "The EMS collector gathers EMS events as defined in your ems.yml file. This panel displays events with emergency severity that occurred within the selected time range.", - "value": "The EMS collector gathers EMS events as defined in your ems.yml file. This panel displays events with emergency severity that occurred within the selected time range." + "text": "This panel displays all emergency EMS alerts active in the past 24 hours.", + "value": "This panel displays all emergency EMS alerts active in the past 24 hours." } ], - "query": "The EMS collector gathers EMS events as defined in your ems.yml file. This panel displays events with emergency severity that occurred within the selected time range.", + "query": "This panel displays all emergency EMS alerts active in the past 24 hours. ", "skipUrlSync": false, "type": "textbox" } @@ -4503,5 +4517,5 @@ "timezone": "", "title": "ONTAP: Health", "uid": "cdot-health", - "version": 4 + "version": 5 }
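
Two short usage sketches follow; they are illustrative only and are not part of the diff above.

First, a minimal sketch of the Health plugin section with a widened severity list. It mirrors the `ems`/`severity` shape introduced in `conf/rest/9.6.0/health.yaml`; whether you edit the shipped template or a local copy depends on your setup:

```yaml
plugins:
  - Health:
      ems:
        severity:
          - emergency
          - alert   # illustrative: also count alert-level events
          - error   # illustrative: also count error-level events
```

Second, a sketch of a Prometheus alerting rule driven by the new `health_ems_alerts` metric. The group name, alert name, duration, and labels are assumptions for the example, not recommendations:

```yaml
groups:
  - name: harvest-health-ems            # hypothetical group name
    rules:
      - alert: EmergencyEmsEvents       # hypothetical alert name
        # health_ems_alerts counts matching EMS events seen in the last 24 hours
        expr: sum(health_ems_alerts{severity="emergency"}) by (cluster, message) > 0
        for: 5m
        labels:
          severity: critical            # illustrative routing label
        annotations:
          summary: "Emergency EMS event {{ $labels.message }} on cluster {{ $labels.cluster }}"
```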