-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
INFRA-388 Converting smartmon into python and adding mock tests #1327
base: stackhpc/2024.1
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import subprocess | ||
import json | ||
from datetime import datetime | ||
|
||
# Absolute path of the smartctl binary invoked for every query in this script.
SMARTCTL_PATH = "/usr/sbin/smartctl"
|
||
def run_command(command, parse_json=False):
    """Run *command* and return what it wrote to standard output.

    Args:
        command: Argument list handed to ``subprocess.run``.
        parse_json: When true, decode stdout as JSON and return the resulting
            object; otherwise return stdout as text with surrounding
            whitespace stripped.

    Raises:
        json.JSONDecodeError: If ``parse_json`` is true and stdout is not
            valid JSON.
    """
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    captured = completed.stdout
    # The exit status is not inspected here; only stdout is used.
    return json.loads(captured) if parse_json else captured.strip()
|
||
# Normalised SMART attribute names that are exported as metrics. Attributes
# reported by smartctl whose normalised name is not listed here are ignored.
SMARTMON_ATTRS = frozenset({
    "airflow_temperature_cel",
    "command_timeout",
    "current_pending_sector",
    "end_to_end_error",
    "erase_fail_count",
    "g_sense_error_rate",
    "hardware_ecc_recovered",
    "host_reads_32mib",
    "host_reads_mib",
    "host_writes_32mib",
    "host_writes_mib",
    "load_cycle_count",
    "media_wearout_indicator",
    "nand_writes_1gib",
    "offline_uncorrectable",
    "power_cycle_count",
    "power_on_hours",
    "program_fail_cnt_total",
    "program_fail_count",
    "raw_read_error_rate",
    "reallocated_event_count",
    "reallocated_sector_ct",
    "reported_uncorrect",
    "runtime_bad_block",
    "sata_downshift_count",
    "seek_error_rate",
    "spin_retry_count",
    "spin_up_time",
    "start_stop_count",
    "temperature_case",
    "temperature_celsius",
    "temperature_internal",
    "total_lbas_read",
    "total_lbas_written",
    "udma_crc_error_count",
    "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot",
    "wear_leveling_count",
    "workld_host_reads_perc",
    "workld_media_wear_indic",
    "workload_minutes",
    "critical_warning",
    "temperature",
    "available_spare",
    "available_spare_threshold",
    "percentage_used",
    "data_units_read",
    "data_units_written",
    "host_reads",
    "host_writes",
    "controller_busy_time",
    "power_cycles",
    "unsafe_shutdowns",
    "media_errors",
    "num_err_log_entries",
    "warning_temp_time",
    "critical_comp_time",
})


def _flat_attribute_metrics(attrs, labels):
    """Return one metric line per known attribute in a flat name->value map.

    Anything that is not a mapping yields no metrics instead of raising.
    """
    if not isinstance(attrs, dict):
        return []
    metrics = []
    for raw_name, value in attrs.items():
        name = raw_name.replace(' ', '_').lower()
        if name in SMARTMON_ATTRS:
            metrics.append(f"{name}{{{labels}}} {value}")
    return metrics


def parse_smartctl_attributes(disk, disk_type, serial, json_data):
    """Turn the JSON of a ``smartctl -A -j`` call into metric lines.

    Exactly one section of ``json_data`` is used, checked in this order:
    ``nvme_smart_health_information_log``, ``scsi_grown_defect_list``, then
    ``ata_smart_attributes.table``.

    Args:
        disk: Device name used for the ``disk`` label.
        disk_type: Device type used for the ``type`` label.
        serial: Serial number used for the ``serial_number`` label.
        json_data: Decoded smartctl JSON output.

    Returns:
        A list of ``name{labels} value`` strings (without the ``smartmon_``
        prefix). NVMe/SCSI sections give one line per known attribute; each
        known ATA attribute gives four lines (``_value``, ``_worst``,
        ``_threshold``, ``_raw_value``) that also carry a ``smart_id`` label.
    """
    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"'

    if 'nvme_smart_health_information_log' in json_data:
        return _flat_attribute_metrics(
            json_data['nvme_smart_health_information_log'], labels
        )

    if 'scsi_grown_defect_list' in json_data:
        # NOTE(review): smartctl appears to report this key as a plain integer
        # rather than a mapping; the helper then returns no metrics instead of
        # failing on ``.items()`` -- confirm against real SCSI output.
        return _flat_attribute_metrics(
            json_data['scsi_grown_defect_list'], labels
        )

    metrics = []
    if 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']:
        for attr in json_data['ata_smart_attributes']['table']:
            # ATA attribute names use '-' where the exported names use '_'.
            attr_name = attr['name'].replace('-', '_').lower()
            if attr_name not in SMARTMON_ATTRS:
                continue
            attr_labels = f'{labels},smart_id="{attr.get("id", "")}"'
            fields = (
                ("value", attr.get('value', '')),
                ("worst", attr.get('worst', '')),
                ("threshold", attr.get('thresh', '')),
                ("raw_value", attr.get('raw', {}).get('value', '')),
            )
            for suffix, field_value in fields:
                metrics.append(
                    f"{attr_name}_{suffix}{{{attr_labels}}} {field_value}"
                )
    return metrics
|
||
def parse_smartctl_info(disk, disk_type, json_data):
    """Build the device-info and SMART-status metric lines for one disk.

    Args:
        disk: Device name used for the ``disk`` label.
        disk_type: Device type used for the ``type`` label.
        json_data: Decoded JSON of a ``smartctl -i -H -j`` call.

    Returns:
        A list of metric strings (without the ``smartmon_`` prefix):
        ``device_info`` and ``device_smart_available`` always; when SMART is
        available also ``device_smart_enabled`` and, if ``smart_status``
        contains ``passed``, ``device_smart_healthy``.
    """
    info = json_data.get('device', {})
    smart_status = json_data.get('smart_status', {})
    # NOTE(review): smartctl reports availability/enablement under
    # "smart_support" (and the health verdict under "smart_status") -- confirm
    # against the smartctl versions in use. "smart_status" is kept as a
    # fallback so inputs carrying the flags there behave as before.
    smart_support = json_data.get('smart_support', {})
    serial_number = json_data.get('serial_number', '').lower()

    labels = {
        'disk': disk,
        'type': disk_type,
        'vendor': info.get('vendor', ''),
        'product': info.get('product', ''),
        'revision': info.get('revision', ''),
        'lun_id': info.get('lun_id', ''),
        'model_family': json_data.get('model_family', ''),
        'device_model': json_data.get('model_name', ''),
        'serial_number': serial_number,
        'firmware_version': json_data.get('firmware_version', ''),
    }
    label_str = ','.join(f'{k}="{v}"' for k, v in labels.items())
    # Reduced label set shared by the three status metrics.
    status_labels = (
        f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'
    )

    available = smart_support.get(
        'available', smart_status.get('available', False)
    )
    metrics = [
        f'device_info{{{label_str}}} 1',
        f'device_smart_available{{{status_labels}}} {1 if available else 0}',
    ]
    if available:
        enabled = smart_support.get(
            'enabled', smart_status.get('enabled', False)
        )
        metrics.append(
            f'device_smart_enabled{{{status_labels}}} {1 if enabled else 0}'
        )
        if 'passed' in smart_status:
            passed = smart_status.get('passed', False)
            metrics.append(
                f'device_smart_healthy{{{status_labels}}} {1 if passed else 0}'
            )
    return metrics
|
||
def format_output(metrics):
    """Render metric lines as one text block with HELP/TYPE headers.

    The lines are sorted, each is prefixed with ``smartmon_``, and a
    ``# HELP`` / ``# TYPE ... gauge`` pair is written whenever the metric
    name (the text before the first ``{``) differs from the previous line's.

    Args:
        metrics: Iterable of ``name{labels} value`` strings.

    Returns:
        The formatted lines joined with newlines (empty string for no input).
    """
    lines = []
    previous_name = ""
    for sample in sorted(metrics):
        name = sample.partition('{')[0]
        if name != previous_name:
            lines.extend((
                f"# HELP smartmon_{name} SMART metric {name}",
                f"# TYPE smartmon_{name} gauge",
            ))
            previous_name = name
        lines.append(f"smartmon_{sample}")
    return '\n'.join(lines)
|
||
def main():
    """Query smartctl for every detected disk and print the metrics.

    Steps: read the smartctl version, list devices with ``--scan-open``, then
    for each device record a run timestamp and an active flag and, unless the
    device is in standby, append its info/health and attribute metrics. The
    combined, formatted text is printed to stdout.

    Output that cannot be decoded as JSON is tolerated: the version becomes
    ``"unknown"``, the device list becomes empty, a device whose standby check
    fails is treated as inactive, and a device whose info or attribute query
    fails is skipped from that point on.
    """
    # Local import: only ``datetime`` itself is imported at module level.
    from datetime import timezone

    # NOTE(review): a reviewer suggested evaluating the pySMART library
    # (https://pypi.org/project/pySMART/) instead of calling smartctl
    # directly; it can load the device metrics and seems to provide some
    # abstraction over the health state, and the existing tests should remain
    # directly usable.
    try:
        version_output = run_command([SMARTCTL_PATH, '-j'], parse_json=True)
        smartctl_version_list = version_output.get('smartctl', {}).get('version', [])
        if smartctl_version_list:
            smartctl_version_str = '.'.join(map(str, smartctl_version_list))
        else:
            smartctl_version_str = "unknown"
    except json.JSONDecodeError:
        smartctl_version_str = "unknown"
    metrics = [f'smartctl_version{{version="{smartctl_version_str}"}} 1']

    try:
        device_list_output = run_command([SMARTCTL_PATH, '--scan-open', '-j'], parse_json=True)
        devices = []
        for device in device_list_output.get('devices', []):
            disk = device.get('name', '')
            disk_type = device.get('type', 'auto')
            if disk:
                devices.append((disk, disk_type))
    except json.JSONDecodeError:
        devices = []

    for disk, disk_type in devices:
        active = 1
        # Use an aware UTC datetime: ``datetime.utcnow()`` is naive, and
        # ``.timestamp()`` would interpret it as local time, shifting the
        # value by the host's UTC offset.
        run_timestamp = int(datetime.now(timezone.utc).timestamp())
        metrics.append(f'smartctl_run{{disk="{disk}",type="{disk_type}"}} {run_timestamp}')

        try:
            standby_output = run_command([SMARTCTL_PATH, '-n', 'standby', '-d', disk_type, '-j', disk], parse_json=True)
            power_mode = standby_output.get('power_mode', '')
            if power_mode == 'standby':
                active = 0
        except json.JSONDecodeError:
            active = 0  # Assume device is inactive if we can't parse the output

        metrics.append(f'device_active{{disk="{disk}",type="{disk_type}"}} {active}')

        if active == 0:
            continue

        try:
            info_output = run_command([SMARTCTL_PATH, '-i', '-H', '-d', disk_type, '-j', disk], parse_json=True)
        except json.JSONDecodeError:
            continue
        metrics.extend(parse_smartctl_info(disk, disk_type, info_output))
        serial_number = info_output.get('serial_number', '').lower()

        try:
            attributes_output = run_command([SMARTCTL_PATH, '-A', '-d', disk_type, '-j', disk], parse_json=True)
        except json.JSONDecodeError:
            continue
        metrics.extend(parse_smartctl_attributes(disk, disk_type, serial_number, attributes_output))

    formatted_output = format_output(metrics)
    print(formatted_output)
|
||
# Run the collector only when this file is executed as a script.
if __name__ == "__main__":
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Style nit: it would be helpful to move these attributes to a module-level constant, e.g.
`SMARTMON_ATTRS = set(...)`,
and to put each attribute on its own line.