From 0433bdc01d656f08dd38e37b015f9c0ca1b914df Mon Sep 17 00:00:00 2001 From: Chris Grindstaff Date: Wed, 20 Nov 2024 02:22:03 -0500 Subject: [PATCH] feat: Harvest should monitor `wafl.dir.size.warning` (#3304) * feat: Harvest should monitor `wafl.dir.size.warning` --- conf/ems/9.6.0/ems.yaml | 7 ++ container/prometheus/ems_alert_rules.yml | 141 +++++++++++++---------- docs/resources/ems-alert-runbook.md | 12 ++ integration/test/alert_rule_test.go | 4 +- 4 files changed, 103 insertions(+), 61 deletions(-) diff --git a/conf/ems/9.6.0/ems.yaml b/conf/ems/9.6.0/ems.yaml index c8b1014d4..4ebfe64ce 100644 --- a/conf/ems/9.6.0/ems.yaml +++ b/conf/ems/9.6.0/ems.yaml @@ -944,6 +944,13 @@ events: - parameters.mirror_config_id => mirror_config_id - parameters.primary_config_id => primary_config_id + - name: wafl.dir.size.warning + exports: + - parameters.fileid => directory_inum + - parameters.vol => volume + - parameters.app => app + - parameters.volident => vol_ident + - name: wafl.readdir.expired exports: - parameters.app => app diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml index e69d452a4..d5e3085d7 100644 --- a/container/prometheus/ems_alert_rules.yml +++ b/container/prometheus/ems_alert_rules.yml @@ -23,7 +23,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "LUN {{ $labels.lun_path }}, vol {{ $labels.volume_name }} (DSID {{ $labels.volume_dsid }}) destroyed (UUID: {{ $labels.object_uuid }})." + summary: "LUN {{ $labels.lun_path }}, vol {{ $labels.volume }} (DSID {{ $labels.volume_ds_id }}) destroyed (UUID: {{ $labels.object_uuid }})." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#lun-destroyed" @@ -47,7 +47,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "LUN {{ $labels.lun_path }}, vol {{ $labels.volume_name }} (DSID {{ $labels.volume_dsid }}) was brought offline (UUID: {{ $labels.object_uuid }})." + summary: "LUN {{ $labels.lun_path }}, vol {{ $labels.volume }} (DSID {{ $labels.volume_ds_id }}) was brought offline (UUID: {{ $labels.object_uuid }})." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#lun-offline" @@ -71,7 +71,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "NVMe namespace {{ $labels.NVMeNS_path }}, vol {{ $labels.volume_name }} (DSID {{ $labels.volume_dsid }}) was destroyed (UUID: {{ $labels.object_uuid }})." + summary: "NVMe namespace {{ $labels.path }}, vol {{ $labels.volume }} (DSID {{ $labels.volume_ds_id }}) was destroyed (UUID: {{ $labels.object_uuid }})." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nvme-namespace-destroyed" @@ -95,7 +95,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "NVMe namespace {{ $labels.path }}, vol {{ $labels.volume_name }} (DSID {{ $labels.volume_dsid }}) was brought offline (UUID: {{ $labels.object_uuid }})." + summary: "NVMe namespace {{ $labels.path }}, vol {{ $labels.volume }} (DSID {{ $labels.volume_ds_id }}) was brought offline (UUID: {{ $labels.object_uuid }})." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nvme-namespace-offline" @@ -119,7 +119,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "NVMe namespace {{ $labels.path }}, vol {{ $labels.volume_name }} (DSID {{ $labels.volume_dsid }}) was brought online (UUID: {{ $labels.object_uuid }})." 
+ summary: "NVMe namespace {{ $labels.path }}, vol {{ $labels.volume }} (DSID {{ $labels.volume_ds_id }}) was brought online (UUID: {{ $labels.object_uuid }})." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nvme-namespace-online" @@ -143,7 +143,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Many simultaneous new CIFS connections are occurring on Vserver ID {{ $labels.vsId }} from IP address {{ $labels.remoteIpAddress }} object type is {{ $labels.object_type }} with UUID {{ $labels.object_uuid }}." + summary: "Many simultaneous new CIFS connections are occurring on Vserver ID {{ $labels.vs_id }} from IP address {{ $labels.remote_ip_address }} object type is {{ $labels.object_type }} with UUID {{ $labels.object_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#too-many-cifs-authentication" @@ -167,7 +167,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Received too many open file requests for the same file by one user on a connection: clientIP:port {{ $labels.IpAddress }}:{{ $labels.port }}, file \"{{ $labels.filePath }}\" on share \"{{ $labels.shareName }}\", vserver: \"{{ $labels.vserverName }}\". Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." + summary: "Received too many open file requests for the same file by one user on a connection: clientIP:port {{ $labels.ip_address }}:{{ $labels.port }}, file \"{{ $labels.file_path }}\" on share \"{{ $labels.share }}\", vserver: \"{{ $labels.svm }}\". Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#max-times-open-per-file-exceeded" @@ -191,7 +191,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Received too many session requests from the same user on one TCP connection: clientIP:port {{ $labels.IpAddress }}:{{ $labels.port }}, user \"{{ $labels.userName }}\", vserver: \"{{ $labels.vserverName }}\". Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." + summary: "Received too many session requests from the same user on one TCP connection: clientIP:port {{ $labels.ip_address }}:{{ $labels.port }}, user \"{{ $labels.user }}\", vserver: \"{{ $labels.svm }}\". Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#max-sessions-per-user-exceeded" @@ -215,7 +215,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "The NetBIOS Name Service received a negative name registration response. The name {{ $labels.nbName }} is owned by a remote machine. The IP address being registered is {{ $labels.IpAddress }}. Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." + summary: "The NetBIOS Name Service received a negative name registration response. The name {{ $labels.nb }} is owned by a remote machine. The IP address being registered is {{ $labels.ip_address }}. Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." 
impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#netbios-name-conflict" @@ -239,7 +239,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Vserver ID: {{ $labels.vserverId }}, user name: {{ $labels.userName }}, client ip: {{ $labels.clientIp }}, Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." + summary: "Vserver ID: {{ $labels.svm_uuid }}, user name: {{ $labels.user }}, client ip: {{ $labels.client_ip }}, Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nonexistent-admin-share" @@ -263,7 +263,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "NFS Store Pool for {{ $labels.poolname }} exhausted. Associated object type is {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." + summary: "NFS Store Pool for {{ $labels.pool }} exhausted. Associated object type is {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nfsv4-store-pool-exhausted" @@ -287,7 +287,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "For Vserver \"{{ $labels.vserverName }}\", the attempt to connect to the privileged ONTAP_ADMIN$ share by the client \"{{ $labels.scannerIp }}\" is rejected because its logged-in user \"{{ $labels.userName }}\" is not configured in any of the Vserver active scanner pools." + summary: "For Vserver \"{{ $labels.svm }}\", the attempt to connect to the privileged ONTAP_ADMIN$ share by the client \"{{ $labels.scanner_ip }}\" is rejected because its logged-in user \"{{ $labels.user }}\" is not configured in any of the Vserver active scanner pools." impact: "Security" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#unauthorized-user-access-to-admin-share" @@ -311,7 +311,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "For Vserver \"{{ $labels.vserverName }}\", AV server \"{{ $labels.scannerIp }}\" is too busy to accept new scan requests." + summary: "For Vserver \"{{ $labels.svm }}\", AV server \"{{ $labels.scanner_ip }}\" is too busy to accept new scan requests." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#antivirus-server-busy" @@ -335,7 +335,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "For Vserver \"{{ $labels.vserverName }}\", ONTAP(R) forcibly closed the vscan connection originated from the nonresponsive AV server \"{{ $labels.scannerIp }}\"." + summary: "For Vserver \"{{ $labels.svm }}\", ONTAP(R) forcibly closed the vscan connection originated from the nonresponsive AV server \"{{ $labels.scanner_ip }}\"." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#non-responsive-antivirus-server" @@ -359,7 +359,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "For Vserver \"{{ $labels.vserverName }}\", AV Connector running on the AV server \"{{ $labels.scannerIp }}\" does not have a registered scan-engine to it." + summary: "For Vserver \"{{ $labels.svm }}\", AV Connector running on the AV server \"{{ $labels.scanner_ip }}\" does not have a registered scan-engine to it." 
impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#no-registered-scan-engine" @@ -383,7 +383,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Vserver \"{{ $labels.vserverName }}\" has no virus scanner connection." + summary: "Vserver \"{{ $labels.svm }}\" has no virus scanner connection." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#no-vscan-connection" @@ -407,7 +407,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Possible virus detected. Vserver: {{ $labels.vserverName }}, vscan server IP: {{ $labels.vscanServerIp }}, file path: {{ $labels.filePath }}, client IP: {{ $labels.clientIp }}, SID: {{ $labels.SID }}, vscan engine status: {{ $labels.vscanEngineStatus }}, vscan engine result string: {{ $labels.vscanEngineResultString }}." + summary: "Possible virus detected. Vserver: {{ $labels.svm }}, vscan server IP: {{ $labels.vscan_server_ip }}, file path: {{ $labels.file_path }}, client IP: {{ $labels.client_ip }}, SID: {{ $labels.sid }}, vscan engine status: {{ $labels.vscanEngineStatus }}, vscan engine result string: {{ $labels.vscanEngineResultString }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#virus-detected" @@ -431,7 +431,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Relocation of aggregate '{{ $labels.vol }}' (uuid: {{ $labels.aggr_uuid }}) failed due to {{ $labels.reason }} preventing object store access on the destination node." + summary: "Relocation of aggregate '{{ $labels.volume }}' (uuid: {{ $labels.aggr_uuid }}) failed due to {{ $labels.reason }} preventing object store access on the destination node." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#relocation-of-storage-pool-failed" @@ -455,7 +455,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Anti-ransomware state was changed to \"{{ $labels.op }}\" on volume \"{{ $labels.volumeName }}\" (UUID: \"{{ $labels.volumeUuid }}\") in Vserver \"{{ $labels.vserverName }}\" (UUID: \"{{ $labels.vserverUuid }}\")." + summary: "Anti-ransomware state was changed to \"{{ $labels.op }}\" on volume \"{{ $labels.volume }}\" (UUID: \"{{ $labels.volume_uuid }}\") in Vserver \"{{ $labels.svm }}\" (UUID: \"{{ $labels.svm_uuid }}\")." impact: "Security" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#volume-anti-ransomware-monitoring" @@ -479,7 +479,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Anti-ransomware was changed to \"{{ $labels.op }}\" on Vserver \"{{ $labels.vserverName }}\" (UUID: \"{{ $labels.vserverUuid }}\")." + summary: "Anti-ransomware was changed to \"{{ $labels.op }}\" on Vserver \"{{ $labels.svm }}\" (UUID: \"{{ $labels.svm_uuid }}\")." impact: "Security" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#storage-vm-anti-ransomware-monitoring" @@ -623,7 +623,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "A shadow copy operation has failed: {{ $labels.errMsg }}. 
( Operation : {{ $labels.operation }} , Client Shadow Copy Set ID : {{ $labels.clientShadowCopySetId }} , Filer Shadow Copy Set ID : {{ $labels.filerShadowCopySetId }} , Client Shadow Copy ID : {{ $labels.clientShadowCopyId }} , Filer Shadow Copy ID : {{ $labels.filerShadowCopyId }} , Share Name : {{ $labels.shareName }}, Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }} )" + summary: "A shadow copy operation has failed: {{ $labels.errMsg }}. ( Operation : {{ $labels.operation }} , Client Shadow Copy Set ID : {{ $labels.client_shadow_copy_set_id }} , Filer Shadow Copy Set ID : {{ $labels.filer_shadow_copy_set_id }} , Client Shadow Copy ID : {{ $labels.client_shadow_copy_id }} , Filer Shadow Copy ID : {{ $labels.filer_shadow_copy_id }} , Share Name : {{ $labels.share }}, Object type is: {{ $labels.object_type }} with UUID: {{ $labels.object_uuid }} )" impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#shadow-copy-failed" @@ -647,7 +647,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "A module attempted to access credential information before the cloud credential thread initialized on node {{ $labels.nodeUuid }}." + summary: "A module attempted to access credential information before the cloud credential thread initialized on node {{ $labels.node_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#aws-credentials-not-initialized" @@ -671,7 +671,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Cluster switch: {{ $labels.switch_name }} power supply: {{ $labels.pwr_supply_name }} status: {{ $labels.status }}." + summary: "Cluster switch: {{ $labels.switch }} power supply: {{ $labels.pwr_supply }} status: {{ $labels.status }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#storage-switch-power-supplies-failed" @@ -695,7 +695,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Drive {{ $labels.diskName }} ({{ $labels.serialno }}){{ $labels.reason }}. Power-On Hours: {{ $labels.powerOnHours }}, GList Count: {{ $labels.glistEntries }}, Drive Info: {{ $labels.disk_information }}." + summary: "Drive {{ $labels.disk }} ({{ $labels.serial_no }}){{ $labels.reason }}. Power-On Hours: {{ $labels.power_on_hours }}, GList Count: {{ $labels.glistEntries }}, Drive Info: {{ $labels.disk_information }}." 
impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#disk-out-of-service" @@ -719,7 +719,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "{{ $labels.location }} power supply was added to {{ $labels.channelName }}.shelf{{ $labels.shelfIdent }}" + summary: "{{ $labels.location }} power supply was added to {{ $labels.channel }}.shelf{{ $labels.shelf_ident }}" impact: "Configuration" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#disk-shelf-power-supply-discovered" @@ -743,7 +743,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "{{ $labels.location }} power supply was removed from {{ $labels.channelName }}.shelf{{ $labels.shelfIdent }}" + summary: "{{ $labels.location }} power supply was removed from {{ $labels.channel }}.shelf{{ $labels.shelf_ident }}" impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#disk-shelves-power-supply-removed" @@ -815,7 +815,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Giveback of aggregate '{{ $labels.vol }}' (uuid: {{ $labels.aggr_uuid }}) failed due to {{ $labels.reason }} preventing object store access on the destination node." + summary: "Giveback of aggregate '{{ $labels.volume }}' (uuid: {{ $labels.aggr_uuid }}) failed due to {{ $labels.reason }} preventing object store access on the destination node." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#giveback-of-storage-pool-failed" @@ -1055,7 +1055,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Unable to connect to the object store \"{{ $labels.configname }}\" from node {{ $labels.node_uuid }}. Reason: {{ $labels.reason }}." + summary: "Unable to connect to the object store \"{{ $labels.config }}\" from node {{ $labels.node_uuid }}. Reason: {{ $labels.reason }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#cloud-tier-unreachable" @@ -1079,7 +1079,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Object-store server host name \"{{ $labels.hostname }}\" cannot be resolved to an IP address on node {{ $labels.nodeUuid }}." + summary: "Object-store server host name \"{{ $labels.host }}\" cannot be resolved to an IP address on node {{ $labels.node_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#object-store-host-unresolvable" @@ -1103,7 +1103,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Object-store client could not find an operational intercluster LIF (IPspace ID: {{ $labels.ipspaceID }}) on node {{ $labels.nodeUuid }}." + summary: "Object-store client could not find an operational intercluster LIF (IPspace ID: {{ $labels.ipspace_id }}) on node {{ $labels.node_uuid }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#object-store-intercluster-lif-down" @@ -1127,7 +1127,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Object-store {{ $labels.operation }} operation server-calculated request signature does not match the signature sent to object-store server {{ $labels.serverHostname }} for bucket or container \"{{ $labels.bucket }}\" on node {{ $labels.nodeUuid }}. Check the keys and signing method." 
+ summary: "Object-store {{ $labels.operation }} operation server-calculated request signature does not match the signature sent to object-store server {{ $labels.server_host }} for bucket or container \"{{ $labels.bucket }}\" on node {{ $labels.node_uuid }}. Check the keys and signing method." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#object-store-signature-mismatch" @@ -1199,7 +1199,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "FC target port {{ $labels.portname }} has {{ $labels.active_commands }} outstanding commands, which exceeds the maximum number of commands {{ $labels.max_commands }} that can be supported by this port." + summary: "FC target port {{ $labels.port }} has {{ $labels.active_commands }} outstanding commands, which exceeds the maximum number of commands {{ $labels.max_commands }} that can be supported by this port." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#fc-target-port-commands-exceeded" @@ -1271,7 +1271,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "{{ $labels.prodChannel }} cooling fan error for {{ $labels.typeText }} {{ $labels.fanNumber }}: {{ $labels.errorMsg }}{{ $labels.errorText }}. {{ $labels.locationText }}." + summary: "{{ $labels.prod_channel }} cooling fan error for {{ $labels.typeText }} {{ $labels.fan_number }}: {{ $labels.errorMsg }}{{ $labels.errorText }}. {{ $labels.locationText }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#shelf-fan-failed" @@ -1319,7 +1319,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "ONTAP Mediator (version {{ $labels.version }}) is added on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'." + summary: "ONTAP Mediator (version {{ $labels.version }}) is added on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peer_cluster }}' and mediator IP address '{{ $labels.ip_address }}'." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-added" @@ -1343,7 +1343,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "CA certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}." + summary: "CA certificate of the ONTAP Mediator (IP: {{ $labels.ip_address }}) expired on {{ $labels.expiry_date }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-ca-certificate-expired" @@ -1367,7 +1367,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "CA certificate for the ONTAP Mediator (IP: {{ $labels.ipAddress }}) will expire in {{ $labels.daysToExpire }} days. Expiry: {{ $labels.expiryDate }}." + summary: "CA certificate for the ONTAP Mediator (IP: {{ $labels.ip_address }}) will expire in {{ $labels.days_to_expire }} days. Expiry: {{ $labels.expiry_date }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-ca-certificate-expiring" @@ -1391,7 +1391,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Client certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}." + summary: "Client certificate of the ONTAP Mediator (IP: {{ $labels.ip_address }}) expired on {{ $labels.expiry_date }}." 
impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-client-certificate-expired" @@ -1415,7 +1415,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Client certificate for the ONTAP Mediator (IP: {{ $labels.ipAddress }}) will expire in {{ $labels.daysToExpire }} days. Expiry: {{ $labels.expiryDate }}." + summary: "Client certificate for the ONTAP Mediator (IP: {{ $labels.ip_address }}) will expire in {{ $labels.days_to_expire }} days. Expiry: {{ $labels.expiry_date }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-client-certificate-expiring" @@ -1439,7 +1439,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "ONTAP Mediator is not accessible on cluster '{{ $labels.cluster }}' with Mediator IP address '{{ $labels.ipAddress }}'." + summary: "ONTAP Mediator is not accessible on cluster '{{ $labels.cluster }}' with Mediator IP address '{{ $labels.ip_address }}'." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-not-accessible" @@ -1463,7 +1463,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "ONTAP Mediator (version {{ $labels.version }}) was removed on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'." + summary: "ONTAP Mediator (version {{ $labels.version }}) was removed on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peer_cluster }}' and mediator IP address '{{ $labels.ip_address }}'." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-removed" @@ -1487,7 +1487,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Server certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}." + summary: "Server certificate of the ONTAP Mediator (IP: {{ $labels.ip_address }}) expired on {{ $labels.expiry_date }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-server-certificate-expired" @@ -1511,7 +1511,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Server certificate for the ONTAP Mediator (IP: {{ $labels.ipAddress }}) will expire in {{ $labels.daysToExpire }} days. Expiry: {{ $labels.expiryDate }}." + summary: "Server certificate for the ONTAP Mediator (IP: {{ $labels.ip_address }}) will expire in {{ $labels.days_to_expire }} days. Expiry: {{ $labels.expiry_date }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-server-certificate-expiring" @@ -1535,7 +1535,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "ONTAP Mediator (IP: {{ $labels.ipAddress }}) is unreachable from cluster {{ $labels.cluster }}." + summary: "ONTAP Mediator (IP: {{ $labels.ip_address }}) is unreachable from cluster {{ $labels.cluster }}." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#ontap-mediator-unreachable" @@ -1559,7 +1559,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Source volume \"{{ $labels.srcpath }}\" and destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" is in \"out-of-sync\" status due to the following reason: \"{{ $labels.error_msg }}\"." 
+ summary: "Source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" is in \"out-of-sync\" status due to the following reason: \"{{ $labels.error_msg }}\"." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#snapmirror-relationship-out-of-sync" @@ -1583,7 +1583,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Source CG \"{{ $labels.srccgpath }}\" and destination CG \"{{ $labels.dstcgpath }}\" with relationship UUID \"{{ $labels.cg_relationship_id }}\" is in \"out-of-sync\" status. Reason: \"{{ $labels.error_msg }}\"." + summary: "Source CG \"{{ $labels.src_cg_path }}\" and destination CG \"{{ $labels.dst_cg_path }}\" with relationship UUID \"{{ $labels.cg_relationship_id }}\" is in \"out-of-sync\" status. Reason: \"{{ $labels.error_msg }}\"." impact: "Protection" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#snapmirror-active-sync-relationship-out-of-sync" @@ -1679,7 +1679,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Vserver {{ $labels.vserver_name }} (UUID: {{ $labels.vserver_uuid }}) stopped successfully." + summary: "Vserver {{ $labels.svm }} (UUID: {{ $labels.svm_uuid }}) stopped successfully." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#storage-vm-stop-succeeded" @@ -1727,7 +1727,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "A READDIR file operation has expired for the directory associated with volume \"{{ $labels.volume }}{{ $labels.app }}/{{ $labels.volident }}\" Snapshot copy ID {{ $labels.snapid }} and inode {{ $labels.directory_inum }}." + summary: "A READDIR file operation has expired for the directory associated with volume \"{{ $labels.volume }}{{ $labels.app }}/{{ $labels.vol_ident }}\" Snapshot copy ID {{ $labels.snap_id }} and inode {{ $labels.directory_inum }}." impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#readdir-timeout" @@ -1751,7 +1751,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Volume autosize: Automatic {{ $labels.event_type }} of volume '{{ $labels.vol }}{{ $labels.app }}{{ $labels.volident }}' by {{ $labels.size }} is complete." + summary: "Volume autosize: Automatic {{ $labels.event_type }} of volume '{{ $labels.volume }}{{ $labels.app }}{{ $labels.vol_ident }}' by {{ $labels.size }} is complete." 
impact: "Capacity" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#volume-automatic-resizing-succeeded" @@ -1775,7 +1775,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Volume '{{ $labels.name }}{{ $labels.app }}{{ $labels.volident }}' has been set temporarily offline" + summary: "Volume '{{ $labels.volume }}{{ $labels.app }}{{ $labels.vol_ident }}' has been set temporarily offline" impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#volume-offline" @@ -1799,7 +1799,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "vol=\"{{ $labels.vol }}\", app=\"{{ $labels.app }}\", volident=\"{{ $labels.volident }}\", instuuid=\"{{ $labels.instuuid }}\"" + summary: "vol=\"{{ $labels.volume }}\", app=\"{{ $labels.app }}\", vol_ident=\"{{ $labels.vol_ident }}\", instuuid=\"{{ $labels.inst_uuid }}\"" impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#volume-restricted" @@ -1823,7 +1823,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Resynchronize operation between source volume \"{{ $labels.srcpath }}\" and destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed. The next auto-resync will be attempted after \"{{ $labels.next_resync_interval }}\" mins." + summary: "Resynchronize operation between source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed. The next auto-resync will be attempted after \"{{ $labels.next_resync_interval }}\" mins." impact: "Protection" - alert: SnapMirror Relationship Common Snapshot Failed @@ -1846,7 +1846,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Creating a common Snapshot copy for source volume \"{{ $labels.srcpath }}\" and destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed due to the following reason:\"{{ $labels.error_msg }}\". Elapsed time since the latest successful common Snapshot copy is \"{{ $labels.css_fail_interval }}\"." + summary: "Creating a common Snapshot copy for source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed due to the following reason:\"{{ $labels.error_msg }}\". Elapsed time since the latest successful common Snapshot copy is \"{{ $labels.css_fail_interval }}\"." impact: "Protection" - alert: SnapMirror Relationship Snapshot is not Replicated @@ -1869,7 +1869,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Snapshot copy \"{{ $labels.snapshot }}\" is not sucessfully replicated for the relationship \"{{ $labels.transferId }}\" with source volume DSID \"{{ $labels.volumeDSID }}\" and path \"{{ $labels.volumePath }}\". Reason: \"{{ $labels.failureReason }}\"." + summary: "Snapshot copy \"{{ $labels.snapshot }}\" is not sucessfully replicated for the relationship \"{{ $labels.transfer_id }}\" with source volume DSID \"{{ $labels.volume_DSID }}\" and path \"{{ $labels.volume_path }}\". Reason: \"{{ $labels.failure_reason }}\"." 
impact: "Protection" - alert: Fanout SnapMirror Relationship Common Snapshot Deleted @@ -1915,7 +1915,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "Initialize from source volume \"{{ $labels.srcpath }}\" to destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" failed with error \"{{ $labels.error }}\"." + summary: "Initialize from source volume \"{{ $labels.src_path }}\" to destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" failed with error \"{{ $labels.error }}\"." impact: "Protection" - alert: SnapMirror active sync Automatic Unplanned Failover Failed @@ -1938,7 +1938,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "SnapMirror automatic failover failed for Destination path: \"{{ $labels.dstpath }}\"." + summary: "SnapMirror automatic failover failed for Destination path: \"{{ $labels.dst_path }}\"." impact: "Protection" - alert: SnapMirror active sync Automatic Unplanned Failover Completed @@ -1961,7 +1961,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "SnapMirror automatic failover completed for Destination path: \"{{ $labels.dstpath }}\"." + summary: "SnapMirror automatic failover completed for Destination path: \"{{ $labels.dst_path }}\"." impact: "Protection" - alert: SnapMirror active sync Planned Failover Failed @@ -1984,7 +1984,7 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "SnapMirror active sync planned failover operation failed for Destination path: \"{{ $labels.dstpath }}\"." + summary: "SnapMirror active sync planned failover operation failed for Destination path: \"{{ $labels.dst_path }}\"." impact: "Protection" - alert: SnapMirror active sync Planned Failover Completed @@ -2007,5 +2007,28 @@ groups: {{- end -}} {{- end -}} annotations: - summary: "SnapMirror active sync planned failover operation completed for Destination path: \"{{ $labels.dstpath }}\"." - impact: "Protection" \ No newline at end of file + summary: "SnapMirror active sync planned failover operation completed for Destination path: \"{{ $labels.dst_path }}\"." + impact: "Protection" + + - alert: Directory size is approaching the maximum directory size (maxdirsize) limit + expr: last_over_time(ems_events{message="wafl.dir.size.warning"}[5m]) == 1 + labels: + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} + annotations: + summary: "Directory size for file ID \"{{ $labels.directory_inum }}\" in volume \"{{ $labels.volume }}{{ $labels.app }}/{{ $labels.vol_ident }}\" is approaching the maximum directory size (maxdirsize) limit." + impact: "Availability" diff --git a/docs/resources/ems-alert-runbook.md b/docs/resources/ems-alert-runbook.md index ecc2fd5bd..74981f4ee 100644 --- a/docs/resources/ems-alert-runbook.md +++ b/docs/resources/ems-alert-runbook.md @@ -52,6 +52,18 @@ If you use Cloud Volumes ONTAP, perform the following corrective actions: 2. Ensure that the login and connectivity information is still valid. Contact NetApp technical support if the issue persists. 
+### Directory size is approaching the maximum directory size (maxdirsize) limit + +**Impact**: Availability + +**EMS Event**: `wafl.dir.size.warning` + +This message occurs when the size of a directory surpasses a configured percentage (default: 90%) of its current maximum directory size (maxdirsize) limit. + +**Remediation** + +Use the "volume file show-inode" command with the file ID and volume name information to find the file path. Reduce the number of files in the directory. If not possible, use the (privilege:advanced) option "volume modify -volume vol_name -maxdir-size new_value" to increase the maximum number of files per directory. However, doing so could impact system performance. If you need to increase the maximum directory size, contact NetApp technical support. + ### Disk Out of Service **Impact**: Availability diff --git a/integration/test/alert_rule_test.go b/integration/test/alert_rule_test.go index 2b41c4106..0e6ff3c79 100644 --- a/integration/test/alert_rule_test.go +++ b/integration/test/alert_rule_test.go @@ -185,9 +185,9 @@ func parseEmsLabels(exports *node.Node) string { var labels []string if exports != nil { for _, export := range exports.GetAllChildContentS() { - name, _, _, _ := util.ParseMetric(export) + name, display, _, _ := util.ParseMetric(export) if strings.HasPrefix(name, "parameters") { - labels = append(labels, strings.Split(name, ".")[1]) + labels = append(labels, display) } } }
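A quick way to validate the new `wafl.dir.size.warning` rule offline is a promtool unit test. The sketch below is illustrative and not part of this patch: the `rule_files` path and every series label value (`volume`, `app`, `vol_ident`, `directory_inum`) are made-up assumptions. It feeds one `ems_events` sample with ONTAP severity `error` and asserts that the alert fires with Harvest's mapped severity `warning`, exercising both the `last_over_time(...[5m]) == 1` expression and the severity template.

```yaml
# ems_alert_rules_test.yml -- a minimal promtool unit test for the new alert.
# All label values below are hypothetical; adjust rule_files to where
# container/prometheus/ems_alert_rules.yml lives in your checkout.
rule_files:
  - ems_alert_rules.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # A sample like the one Harvest would emit for this EMS event,
      # carrying the labels exported by the new ems.yaml entry.
      - series: 'ems_events{message="wafl.dir.size.warning", severity="error", volume="vol1", app="nfs", vol_ident="vs1", directory_inum="1234"}'
        values: '1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: "Directory size is approaching the maximum directory size (maxdirsize) limit"
        exp_alerts:
          - exp_labels:
              # The rule's template maps ONTAP severity "error" to "warning".
              severity: warning
              message: wafl.dir.size.warning
              volume: vol1
              app: nfs
              vol_ident: vs1
              directory_inum: "1234"
```

Run it with `promtool test rules ems_alert_rules_test.yml`; the test is self-contained, so the only labels compared are those defined on the input series plus the rule's templated `severity`.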