From e32ad2af7f860dc2b9ea96a318b7cc7e21f5e832 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Sun, 15 Oct 2023 22:59:57 -0700 Subject: [PATCH] feat: Better SCSI/SAS support, and removing confused metrics Implement better SCSI/SAS support and stop exporting misleading metrics. Prior to this PR, the exporter exported a value of 0 for metrics that apply only to certain types of drives; those metrics are no longer exported for drive types where they are not valid. Future work may include parsing the corresponding metrics for SATA/SAS SSDs. Metrics no longer exported for the wrong type of drive: - smartctl_device_nvme_capacity_bytes (NVME-specific) - smartctl_device_available_spare (NVME-specific, ATA possible) - smartctl_device_available_spare_threshold (NVME-specific, ATA possible) - smartctl_device_critical_warning (NVME-specific, ATA possible) - smartctl_device_interface_speed (ATA-specific) - smartctl_device_media_errors (NVME-specific, ATA possible) - smartctl_device_num_err_log_entries (NVME-specific, SCSI uses distinct metrics, ATA possible) - smartctl_device_percentage_used (NVME-specific, ATA possible) Fix the following metrics, which were exported as zero because the exporter did not know how to read them for SCSI devices: - smartctl_device_bytes_read - smartctl_device_bytes_written - smartctl_device_power_cycle_count New metrics: - smartctl_read_errors_corrected_by_eccdelayed - smartctl_read_errors_corrected_by_eccfast - smartctl_write_errors_corrected_by_eccdelayed - smartctl_write_errors_corrected_by_eccfast Fix labels: - smartctl_device{model_name} is now populated for SCSI/SAS, based on scsi_model_name. New labels: - smartctl_device{} gains: scsi_vendor, scsi_product, scsi_revision, scsi_version Signed-off-by: Robin H. 
Johnson --- metrics.go | 37 +++++++++ smartctl.go | 217 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 205 insertions(+), 49 deletions(-) diff --git a/metrics.go b/metrics.go index c675fd0..0ac083f 100644 --- a/metrics.go +++ b/metrics.go @@ -44,6 +44,11 @@ var ( "ata_version", "sata_version", "form_factor", + // scsi_model_name is mapped into model_name + "scsi_vendor", + "scsi_product", + "scsi_revision", + "scsi_version", }, nil, ) @@ -293,6 +298,22 @@ var ( }, nil, ) + metricReadErrorsCorrectedByEccFast = prometheus.NewDesc( + "smartctl_read_errors_corrected_by_eccfast", + "Read Errors Corrected by ECC Fast", + []string{ + "device", + }, + nil, + ) + metricReadErrorsCorrectedByEccDelayed = prometheus.NewDesc( + "smartctl_read_errors_corrected_by_eccdelayed", + "Read Errors Corrected by ECC Delayed", + []string{ + "device", + }, + nil, + ) metricReadTotalUncorrectedErrors = prometheus.NewDesc( "smartctl_read_total_uncorrected_errors", "Read Total Uncorrected Errors", @@ -309,6 +330,22 @@ var ( }, nil, ) + metricWriteErrorsCorrectedByEccFast = prometheus.NewDesc( + "smartctl_write_errors_corrected_by_eccfast", + "Write Errors Corrected by ECC Fast", + []string{ + "device", + }, + nil, + ) + metricWriteErrorsCorrectedByEccDelayed = prometheus.NewDesc( + "smartctl_write_errors_corrected_by_eccdelayed", + "Write Errors Corrected by ECC Delayed", + []string{ + "device", + }, + nil, + ) metricWriteTotalUncorrectedErrors = prometheus.NewDesc( "smartctl_write_total_uncorrected_errors", "Write Total Uncorrected Errors", diff --git a/smartctl.go b/smartctl.go index 23a374b..7f6934a 100644 --- a/smartctl.go +++ b/smartctl.go @@ -29,6 +29,9 @@ type SMARTDevice struct { serial string family string model string + // These are used to select types of metrics. 
+ interface_ string + protocol string } // SMARTctl object @@ -41,15 +44,26 @@ type SMARTctl struct { // NewSMARTctl is smartctl constructor func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl { + var model_name string + if obj := json.Get("model_name"); obj.Exists() { + model_name = obj.String() + } else if obj := json.Get("scsi_model_name"); obj.Exists() { + model_name = obj.String() + } else { + model_name = "unknown" + } + return SMARTctl{ ch: ch, json: json, logger: logger, device: SMARTDevice{ - device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"), - serial: strings.TrimSpace(json.Get("serial_number").String()), - family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), - model: strings.TrimSpace(json.Get("model_name").String()), + device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"), + serial: strings.TrimSpace(json.Get("serial_number").String()), + family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), + model: strings.TrimSpace(model_name), + interface_: strings.TrimSpace(json.Get("device.type").String()), + protocol: strings.TrimSpace(json.Get("device.protocol").String()), }, } } @@ -66,23 +80,31 @@ func (smart *SMARTctl) Collect() { smart.minePowerOnSeconds() smart.mineRotationRate() smart.mineTemperatures() - smart.minePowerCycleCount() + smart.minePowerCycleCount() // ATA/SATA, NVME, SCSI, SAS smart.mineDeviceSCTStatus() smart.mineDeviceStatistics() smart.mineDeviceErrorLog() smart.mineDeviceSelfTestLog() smart.mineDeviceERC() - smart.mineNvmePercentageUsed() - smart.mineNvmeAvailableSpare() - smart.mineNvmeAvailableSpareThreshold() - smart.mineNvmeCriticalWarning() - smart.mineNvmeMediaErrors() - smart.mineNvmeNumErrLogEntries() - smart.mineNvmeBytesRead() - smart.mineNvmeBytesWritten() smart.mineSmartStatus() - smart.mineSCSIGrownDefectList() - smart.mineSCSIErrorCounterLog() + + if 
smart.device.interface_ == "nvme" { + smart.mineNvmePercentageUsed() + smart.mineNvmeAvailableSpare() + smart.mineNvmeAvailableSpareThreshold() + smart.mineNvmeCriticalWarning() + smart.mineNvmeMediaErrors() + smart.mineNvmeNumErrLogEntries() + smart.mineNvmeBytesRead() + smart.mineNvmeBytesWritten() + } + // SCSI, SAS + if smart.device.interface_ == "scsi" { + smart.mineSCSIGrownDefectList() + smart.mineSCSIErrorCounterLog() + smart.mineSCSIBytesRead() + smart.mineSCSIBytesWritten() + } } func (smart *SMARTctl) mineExitStatus() { @@ -95,14 +117,13 @@ func (smart *SMARTctl) mineExitStatus() { } func (smart *SMARTctl) mineDevice() { - device := smart.json.Get("device") smart.ch <- prometheus.MustNewConstMetric( metricDeviceModel, prometheus.GaugeValue, 1, smart.device.device, - device.Get("type").String(), - device.Get("protocol").String(), + smart.device.interface_, + smart.device.protocol, smart.device.family, smart.device.model, smart.device.serial, @@ -111,6 +132,11 @@ func (smart *SMARTctl) mineDevice() { smart.json.Get("ata_version.string").String(), smart.json.Get("sata_version.string").String(), smart.json.Get("form_factor.name").String(), + // scsi_model_name is mapped into model_name + smart.json.Get("scsi_vendor").String(), + smart.json.Get("scsi_product").String(), + smart.json.Get("scsi_revision").String(), + smart.json.Get("scsi_version").String(), ) } @@ -130,12 +156,15 @@ func (smart *SMARTctl) mineCapacity() { smart.json.Get("user_capacity.bytes").Float(), smart.device.device, ) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceTotalCapacityBytes, - prometheus.GaugeValue, - smart.json.Get("nvme_total_capacity").Float(), - smart.device.device, - ) + nvme_total_capacity := smart.json.Get("nvme_total_capacity") + if nvme_total_capacity.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceTotalCapacityBytes, + prometheus.GaugeValue, + nvme_total_capacity.Float(), + smart.device.device, + ) + } } func (smart *SMARTctl) 
mineBlockSize() { @@ -151,16 +180,21 @@ func (smart *SMARTctl) mineBlockSize() { } func (smart *SMARTctl) mineInterfaceSpeed() { + // TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate iSpeed := smart.json.Get("interface_speed") - for _, speedType := range []string{"max", "current"} { - tSpeed := iSpeed.Get(speedType) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceInterfaceSpeed, - prometheus.GaugeValue, - tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(), - smart.device.device, - speedType, - ) + if iSpeed.Exists() { + for _, speedType := range []string{"max", "current"} { + tSpeed := iSpeed.Get(speedType) + if tSpeed.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceInterfaceSpeed, + prometheus.GaugeValue, + tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(), + smart.device.device, + speedType, + ) + } + } } } @@ -200,16 +234,21 @@ func (smart *SMARTctl) mineDeviceAttribute() { func (smart *SMARTctl) minePowerOnSeconds() { pot := smart.json.Get("power_on_time") - smart.ch <- prometheus.MustNewConstMetric( - metricDevicePowerOnSeconds, - prometheus.CounterValue, - GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60, - smart.device.device, - ) + // If the power_on_time is NOT present, do not report as 0. + if pot.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerOnSeconds, + prometheus.CounterValue, + GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60, + smart.device.device, + ) + } } func (smart *SMARTctl) mineRotationRate() { rRate := GetFloatIfExists(smart.json, "rotation_rate", 0) + // TODO: what should be done if this is absent vs really zero (for + // solid-state drives)? 
if rRate > 0 { smart.ch <- prometheus.MustNewConstMetric( metricDeviceRotationRate, @@ -222,6 +261,7 @@ func (smart *SMARTctl) mineRotationRate() { func (smart *SMARTctl) mineTemperatures() { temperatures := smart.json.Get("temperature") + // TODO: Implement scsi_environmental_reports if temperatures.Exists() { temperatures.ForEach(func(key, value gjson.Result) bool { smart.ch <- prometheus.MustNewConstMetric( @@ -237,12 +277,28 @@ func (smart *SMARTctl) mineTemperatures() { } func (smart *SMARTctl) minePowerCycleCount() { - smart.ch <- prometheus.MustNewConstMetric( - metricDevicePowerCycleCount, - prometheus.CounterValue, - smart.json.Get("power_cycle_count").Float(), - smart.device.device, - ) + // ATA & NVME + powerCycleCount := smart.json.Get("power_cycle_count") + if powerCycleCount.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerCycleCount, + prometheus.CounterValue, + powerCycleCount.Float(), + smart.device.device, + ) + return + } + // SCSI + powerCycleCount = smart.json.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles") + if powerCycleCount.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerCycleCount, + prometheus.CounterValue, + powerCycleCount.Float(), + smart.device.device, + ) + return + } } func (smart *SMARTctl) mineDeviceSCTStatus() { @@ -312,29 +368,67 @@ func (smart *SMARTctl) mineNvmeNumErrLogEntries() { } func (smart *SMARTctl) mineNvmeBytesRead() { - blockSize := smart.json.Get("logical_block_size").Float() + blockSize := smart.json.Get("logical_block_size") + data_units_read := smart.json.Get("nvme_smart_health_information_log.data_units_read") + if !blockSize.Exists() || !data_units_read.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesRead, prometheus.CounterValue, // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. 
// When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. - smart.json.Get("nvme_smart_health_information_log.data_units_read").Float()*1000.0*blockSize, + data_units_read.Float()*1000.0*blockSize.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeBytesWritten() { - blockSize := smart.json.Get("logical_block_size").Float() + blockSize := smart.json.Get("logical_block_size") + data_units_written := smart.json.Get("nvme_smart_health_information_log.data_units_written") + if !blockSize.Exists() || !data_units_written.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesWritten, prometheus.CounterValue, // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. - smart.json.Get("nvme_smart_health_information_log.data_units_written").Float()*1000.0*blockSize, + data_units_written.Float()*1000.0*blockSize.Float(), smart.device.device, ) } +func (smart *SMARTctl) mineSCSIBytesRead() { + SCSIHealth := smart.json.Get("scsi_error_counter_log") + if SCSIHealth.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceBytesRead, + prometheus.CounterValue, + // This value is reported by SMARTctl in GB [10^9]. + // It is possible that some drives mis-report the value, but + // that is not the responsibility of the exporter or smartctl + SCSIHealth.Get("read.gigabytes_processed").Float()*1e9, + smart.device.device, + ) + } +} + +func (smart *SMARTctl) mineSCSIBytesWritten() { + SCSIHealth := smart.json.Get("scsi_error_counter_log") + if SCSIHealth.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceBytesWritten, + prometheus.CounterValue, + // This value is reported by SMARTctl in GB [10^9]. 
+ // It is possible that some drives mis-report the value, but + // that is not the responsibility of the exporter or smartctl + SCSIHealth.Get("write.gigabytes_processed").Float()*1e9, + smart.device.device, + ) + } +} + func (smart *SMARTctl) mineSmartStatus() { smart.ch <- prometheus.MustNewConstMetric( metricDeviceSmartStatus, @@ -460,6 +554,18 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() { SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(), smart.device.device, ) + smart.ch <- prometheus.MustNewConstMetric( + metricReadErrorsCorrectedByEccFast, + prometheus.GaugeValue, + SCSIHealth.Get("read.errors_corrected_by_eccfast").Float(), + smart.device.device, + ) + smart.ch <- prometheus.MustNewConstMetric( + metricReadErrorsCorrectedByEccDelayed, + prometheus.GaugeValue, + SCSIHealth.Get("read.errors_corrected_by_eccdelayed").Float(), + smart.device.device, + ) smart.ch <- prometheus.MustNewConstMetric( metricReadTotalUncorrectedErrors, prometheus.GaugeValue, @@ -472,11 +578,24 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() { SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(), smart.device.device, ) + smart.ch <- prometheus.MustNewConstMetric( + metricWriteErrorsCorrectedByEccFast, + prometheus.GaugeValue, + SCSIHealth.Get("write.errors_corrected_by_eccfast").Float(), + smart.device.device, + ) + smart.ch <- prometheus.MustNewConstMetric( + metricWriteErrorsCorrectedByEccDelayed, + prometheus.GaugeValue, + SCSIHealth.Get("write.errors_corrected_by_eccdelayed").Float(), + smart.device.device, + ) smart.ch <- prometheus.MustNewConstMetric( metricWriteTotalUncorrectedErrors, prometheus.GaugeValue, SCSIHealth.Get("write.total_uncorrected_errors").Float(), smart.device.device, ) + // TODO: Should we also export the verify category? } }