|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import subprocess |
| 4 | +import json |
| 5 | +from datetime import datetime |
| 6 | + |
| 7 | +SMARTCTL_PATH = "/usr/sbin/smartctl" |
| 8 | + |
def run_command(command, parse_json=False):
    """Execute *command* and hand back what it wrote to standard output.

    Args:
        command: Argument list passed straight to ``subprocess.run``.
        parse_json: If true, stdout is decoded as JSON and the decoded
            object is returned. Otherwise stdout is returned as text with
            leading and trailing whitespace removed.

    Returns:
        The decoded JSON value or the stripped stdout text.

    Raises:
        json.JSONDecodeError: When ``parse_json`` is set and stdout is not
            valid JSON (for example, when it is empty).
    """
    proc = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # The exit status and stderr are captured but not inspected; only stdout
    # is used.
    if not parse_json:
        return proc.stdout.strip()
    return json.loads(proc.stdout)
| 15 | + |
# Attribute names, normalised to lower-case snake_case, that are exported.
# Anything smartctl reports outside this set is ignored.
_SMARTMON_ATTRS = frozenset({
    "airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count",
    "g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib",
    "host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate",
    "reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count",
    "seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius",
    "temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes",
    "critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used",
    "data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time",
    "power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries",
    "warning_temp_time", "critical_comp_time",
})


def _flat_attribute_metrics(section, labels):
    """Return one sample per whitelisted key of a flat ``name -> value`` mapping."""
    # smartctl may report a section as a plain number instead of a mapping;
    # such a value carries no named attributes, so nothing is emitted.
    if not isinstance(section, dict):
        return []
    samples = []
    for raw_name, value in section.items():
        name = raw_name.replace(' ', '_').lower()
        if name in _SMARTMON_ATTRS:
            samples.append(f"{name}{{{labels}}} {value}")
    return samples


def parse_smartctl_attributes(disk, disk_type, serial, json_data):
    """Convert the attribute section of smartctl JSON into metric lines.

    Exactly one section is processed, chosen in this order: the NVMe health
    log, the SCSI grown-defect entry, then the ATA attribute table.

    Args:
        disk: Device path, used as the ``disk`` label.
        disk_type: Device type, used as the ``type`` label.
        serial: Serial number, used as the ``serial_number`` label.
        json_data: Decoded JSON from ``smartctl -A -j``.

    Returns:
        A list of ``name{labels} value`` strings (without the ``smartmon_``
        prefix). NVMe/SCSI entries yield one sample per attribute; ATA
        entries yield ``_value``, ``_worst``, ``_threshold`` and
        ``_raw_value`` samples carrying an extra ``smart_id`` label.
    """
    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"'

    if 'nvme_smart_health_information_log' in json_data:
        return _flat_attribute_metrics(json_data['nvme_smart_health_information_log'], labels)

    if 'scsi_grown_defect_list' in json_data:
        return _flat_attribute_metrics(json_data['scsi_grown_defect_list'], labels)

    metrics = []
    ata_section = json_data.get('ata_smart_attributes')
    if isinstance(ata_section, dict) and 'table' in ata_section:
        for attr in ata_section['table']:
            # A row without a name can never match the whitelist; skip it
            # instead of raising KeyError.
            name = attr.get('name', '').replace('-', '_').lower()
            if name not in _SMARTMON_ATTRS:
                continue
            smart_id = attr.get('id', '')
            fields = (
                ('value', attr.get('value')),
                ('worst', attr.get('worst')),
                ('threshold', attr.get('thresh')),
                ('raw_value', attr.get('raw', {}).get('value')),
            )
            for suffix, number in fields:
                # A sample with no value is not a valid exposition line, so
                # absent fields are dropped rather than printed empty.
                if number is None or number == '':
                    continue
                metrics.append(f'{name}_{suffix}{{{labels},smart_id="{smart_id}"}} {number}')
    return metrics
| 59 | + |
def _escape_label_value(value):
    """Escape *value* for use inside a double-quoted Prometheus label value."""
    # Backslash must be handled first so the escapes added for quotes and
    # newlines are not escaped a second time.
    return str(value).replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')


def parse_smartctl_info(disk, disk_type, json_data):
    """Build identity and SMART-status metric lines for one device.

    Args:
        disk: Device path, used as the ``disk`` label.
        disk_type: Device type, used as the ``type`` label.
        json_data: Decoded JSON from ``smartctl -i -H -j``.

    Returns:
        A list of metric lines (without the ``smartmon_`` prefix):
        ``device_info`` and ``device_smart_available`` always,
        ``device_smart_enabled`` only when SMART is available, and
        ``device_smart_healthy`` only when a ``passed`` verdict is present.
    """
    device = json_data.get('device', {})
    smart_status = json_data.get('smart_status', {})
    smart_support = json_data.get('smart_support', {})
    serial = json_data.get('serial_number', '').lower()

    def render(pairs):
        # Escaping keeps a quote or backslash in a free-form string (model,
        # vendor, ...) from corrupting the exposition line.
        return ','.join(f'{key}="{_escape_label_value(val)}"' for key, val in pairs)

    def flag(name):
        # smartctl's JSON reports availability/enablement under
        # "smart_support"; the lookup under "smart_status" is kept as a
        # fallback for input shaped the way this function used to expect.
        if name in smart_support:
            return bool(smart_support[name])
        return bool(smart_status.get(name, False))

    info_labels = render([
        ('disk', disk),
        ('type', disk_type),
        ('vendor', device.get('vendor', '')),
        ('product', device.get('product', '')),
        ('revision', device.get('revision', '')),
        ('lun_id', device.get('lun_id', '')),
        ('model_family', json_data.get('model_family', '')),
        ('device_model', json_data.get('model_name', '')),
        ('serial_number', serial),
        ('firmware_version', json_data.get('firmware_version', '')),
    ])
    # The status metrics share one short label set.
    status_labels = render([
        ('disk', disk),
        ('type', disk_type),
        ('serial_number', serial),
    ])

    available = flag('available')
    metrics = [
        f'device_info{{{info_labels}}} 1',
        f'device_smart_available{{{status_labels}}} {int(available)}',
    ]
    if available:
        metrics.append(f'device_smart_enabled{{{status_labels}}} {int(flag("enabled"))}')
    if 'passed' in smart_status:
        healthy = 1 if smart_status.get('passed', False) else 0
        metrics.append(f'device_smart_healthy{{{status_labels}}} {healthy}')
    return metrics
| 85 | + |
def format_output(metrics):
    """Render metric lines as Prometheus text with HELP/TYPE headers.

    The lines are sorted, each is prefixed with ``smartmon_``, and a
    ``# HELP`` / ``# TYPE ... gauge`` pair is written whenever the metric
    name (the text before the first ``{``) differs from the previous line's.

    Args:
        metrics: Iterable of ``name{labels} value`` strings.

    Returns:
        The rendered lines joined with newlines; an empty string when
        *metrics* is empty.
    """
    lines = []
    previous_name = ""
    for sample in sorted(metrics):
        name = sample.partition('{')[0]
        if name != previous_name:
            previous_name = name
            lines.extend((
                f"# HELP smartmon_{name} SMART metric {name}",
                f"# TYPE smartmon_{name} gauge",
            ))
        lines.append(f"smartmon_{sample}")
    return '\n'.join(lines)
| 97 | + |
def main():
    """Collect SMART data for every scanned device and print it as metrics.

    Steps: record the smartctl version, enumerate devices with
    ``--scan-open``, then for each device record the run timestamp and
    whether it is active, and (for active devices only) append the info and
    attribute metrics. The combined result is printed to stdout.

    A smartctl call that cannot be started or whose output is not valid
    JSON is treated as "no data" for that step instead of aborting the run.
    """
    # Imported locally so this function does not rely on a file-level import.
    import time

    # Failures that mean "smartctl produced nothing usable": undecodable
    # output, or the binary being missing / not executable.
    query_errors = (json.JSONDecodeError, OSError)

    try:
        version_output = run_command([SMARTCTL_PATH, '-j'], parse_json=True)
        version_parts = version_output.get('smartctl', {}).get('version', [])
    except query_errors:
        version_parts = []
    smartctl_version_str = '.'.join(map(str, version_parts)) if version_parts else "unknown"
    metrics = [f'smartctl_version{{version="{smartctl_version_str}"}} 1']

    try:
        scan_output = run_command([SMARTCTL_PATH, '--scan-open', '-j'], parse_json=True)
        scanned = scan_output.get('devices', [])
    except query_errors:
        scanned = []
    # Entries without a name cannot be queried, so they are dropped.
    devices = [
        (entry.get('name', ''), entry.get('type', 'auto'))
        for entry in scanned
        if entry.get('name', '')
    ]

    for disk, disk_type in devices:
        # time.time() is the Unix epoch regardless of the host time zone.
        # datetime.utcnow().timestamp() is not: the naive UTC value gets
        # interpreted as local time and is shifted by the UTC offset.
        metrics.append(f'smartctl_run{{disk="{disk}",type="{disk_type}"}} {int(time.time())}')

        try:
            standby_output = run_command(
                [SMARTCTL_PATH, '-n', 'standby', '-d', disk_type, '-j', disk],
                parse_json=True,
            )
            active = 0 if standby_output.get('power_mode', '') == 'standby' else 1
        except query_errors:
            active = 0  # Assume device is inactive if we can't read the output

        metrics.append(f'device_active{{disk="{disk}",type="{disk_type}"}} {active}')
        if active == 0:
            continue

        try:
            info_output = run_command(
                [SMARTCTL_PATH, '-i', '-H', '-d', disk_type, '-j', disk],
                parse_json=True,
            )
        except query_errors:
            continue
        metrics.extend(parse_smartctl_info(disk, disk_type, info_output))
        serial_number = info_output.get('serial_number', '').lower()

        try:
            attributes_output = run_command(
                [SMARTCTL_PATH, '-A', '-d', disk_type, '-j', disk],
                parse_json=True,
            )
        except query_errors:
            continue
        metrics.extend(parse_smartctl_attributes(disk, disk_type, serial_number, attributes_output))

    print(format_output(metrics))
| 154 | + |
| 155 | +if __name__ == "__main__": |
| 156 | + main() |
0 commit comments