Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

INFRA-388 Converting smartmon into python and adding mock tests #1327

Draft
wants to merge 2 commits into
base: stackhpc/2024.1
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions etc/kayobe/ansible/scripts/smartmon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#!/usr/bin/env python3

import subprocess
import json
from datetime import datetime

SMARTCTL_PATH = "/usr/sbin/smartctl"

def run_command(command, parse_json=False):
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if parse_json:
return json.loads(result.stdout)
else:
return result.stdout.strip()

def parse_smartctl_attributes(disk, disk_type, serial, json_data):
labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"'
metrics = []
smartmon_attrs = set([
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nit: It would be helpful to move these to a 'constant' eg. SMARTMON_ATTRS = set(.. and put each attribute on a new line.

"airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count",
"g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib",
"host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
"power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate",
"reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count",
"seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius",
"temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
"unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes",
"critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used",
"data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time",
"power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries",
"warning_temp_time", "critical_comp_time"
])
if 'nvme_smart_health_information_log' in json_data:
smart_log = json_data['nvme_smart_health_information_log']
for attr_name, value in smart_log.items():
attr_name = attr_name.replace(' ', '_').lower()
if attr_name in smartmon_attrs:
metrics.append(f"{attr_name}{{{labels}}} {value}")
elif 'scsi_grown_defect_list' in json_data:
scsi_attrs = json_data.get('scsi_grown_defect_list', {})
for attr_name, value in scsi_attrs.items():
attr_name = attr_name.replace(' ', '_').lower()
if attr_name in smartmon_attrs:
metrics.append(f"{attr_name}{{{labels}}} {value}")
elif 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']:
for attr in json_data['ata_smart_attributes']['table']:
attr_name = attr['name'].replace('-', '_').lower()
if attr_name in smartmon_attrs:
attr_id = attr.get('id', '')
value = attr.get('value', '')
worst = attr.get('worst', '')
threshold = attr.get('thresh', '')
raw_value = attr.get('raw', {}).get('value', '')
metrics.append(f"{attr_name}_value{{{labels},smart_id=\"{attr_id}\"}} {value}")
metrics.append(f"{attr_name}_worst{{{labels},smart_id=\"{attr_id}\"}} {worst}")
metrics.append(f"{attr_name}_threshold{{{labels},smart_id=\"{attr_id}\"}} {threshold}")
metrics.append(f"{attr_name}_raw_value{{{labels},smart_id=\"{attr_id}\"}} {raw_value}")
return metrics

def parse_smartctl_info(disk, disk_type, json_data):
info = json_data.get('device', {})
smart_status = json_data.get('smart_status', {})
labels = {
'disk': disk,
'type': disk_type,
'vendor': info.get('vendor', ''),
'product': info.get('product', ''),
'revision': info.get('revision', ''),
'lun_id': info.get('lun_id', ''),
'model_family': json_data.get('model_family', ''),
'device_model': json_data.get('model_name', ''),
'serial_number': json_data.get('serial_number', '').lower(),
'firmware_version': json_data.get('firmware_version', '')
}
label_str = ','.join(f'{k}="{v}"' for k, v in labels.items())
metrics = [
f'device_info{{{label_str}}} 1',
f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("available", False) else 0}',
]
if smart_status.get("available", False):
metrics.append(f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("enabled", False) else 0}')
if 'passed' in smart_status:
metrics.append(f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("passed", False) else 0}')
return metrics

def format_output(metrics):
output = []
last_metric = ""
for metric in sorted(metrics):
metric_name = metric.split('{')[0]
if metric_name != last_metric:
output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}")
output.append(f"# TYPE smartmon_{metric_name} gauge")
last_metric = metric_name
output.append(f"smartmon_{metric}")
return '\n'.join(output)

def main():
try:
version_output = run_command([SMARTCTL_PATH, '-j'], parse_json=True)
smartctl_version_list = version_output.get('smartctl', {}).get('version', [])
if smartctl_version_list:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you looked at Python libraries for doing this?

Eg. This one looks reasonable: https://pypi.org/project/pySMART/

It can handle loading all the device metrics in via smartctl as you are doing here.

It seems to provide some abstraction over the health state as well.

The work you've put into the tests should be directly usable

smartctl_version_str = '.'.join(map(str, smartctl_version_list))
else:
smartctl_version_str = "unknown"
except json.JSONDecodeError:
smartctl_version_str = "unknown"
metrics = [f'smartctl_version{{version="{smartctl_version_str}"}} 1']

try:
device_list_output = run_command([SMARTCTL_PATH, '--scan-open', '-j'], parse_json=True)
devices = []
for device in device_list_output.get('devices', []):
disk = device.get('name', '')
disk_type = device.get('type', 'auto')
if disk:
devices.append((disk, disk_type))
except json.JSONDecodeError:
devices = []

for disk, disk_type in devices:
serial_number = ''
active = 1
metrics.append(f'smartctl_run{{disk="{disk}",type="{disk_type}"}} {int(datetime.utcnow().timestamp())}')

try:
standby_output = run_command([SMARTCTL_PATH, '-n', 'standby', '-d', disk_type, '-j', disk], parse_json=True)
power_mode = standby_output.get('power_mode', '')
if power_mode == 'standby':
active = 0
except json.JSONDecodeError:
active = 0 # Assume device is inactive if we can't parse the output

metrics.append(f'device_active{{disk="{disk}",type="{disk_type}"}} {active}')

if active == 0:
continue

try:
info_output = run_command([SMARTCTL_PATH, '-i', '-H', '-d', disk_type, '-j', disk], parse_json=True)
except json.JSONDecodeError:
continue
metrics.extend(parse_smartctl_info(disk, disk_type, info_output))
serial_number = info_output.get('serial_number', '').lower()

try:
attributes_output = run_command([SMARTCTL_PATH, '-A', '-d', disk_type, '-j', disk], parse_json=True)
except json.JSONDecodeError:
continue
metrics.extend(parse_smartctl_attributes(disk, disk_type, serial_number, attributes_output))

formatted_output = format_output(metrics)
print(formatted_output)

if __name__ == "__main__":
main()
Loading