"""Hardware health monitoring module for NVIDIA datacenter GPUs.""" import subprocess import shutil import os from datetime import datetime from typing import Optional from rich.console import Console from rich.table import Table from rich.panel import Panel from rich.text import Text from modules.gpu_specs import detect_gpu_type, get_gpu_specs class HealthCheck: def __init__(self, config: dict): self.config = config self.console = Console() self.health_cfg = config.get("health", {}) self.gpu_type = detect_gpu_type() self.specs = get_gpu_specs(self.gpu_type) def _run_smi(self, query: str) -> Optional[str]: if not shutil.which("nvidia-smi"): return None try: r = subprocess.run( ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=30, ) return r.stdout.strip() if r.returncode == 0 else None except (subprocess.TimeoutExpired, FileNotFoundError): return None def _run_cmd(self, cmd: list, timeout: int = 10) -> Optional[str]: try: r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) return r.stdout.strip() if r.returncode == 0 else None except (subprocess.TimeoutExpired, FileNotFoundError): return None def _safe_int(self, val, default=0): try: return int(val) if val not in ("N/A", "", "[N/A]") else default except (ValueError, TypeError): return default def _safe_float(self, val, default=0.0): try: return float(val) if val not in ("N/A", "", "[N/A]") else default except (ValueError, TypeError): return default def run(self) -> dict: if not shutil.which("nvidia-smi"): return {"error": "nvidia-smi not found", "passed": False} gpu_count_str = self._run_smi("count") if not gpu_count_str: return {"error": "nvidia-smi query failed", "passed": False} gpu_count = int(gpu_count_str.split("\n")[0]) def query_lines(field): raw = self._run_smi(field) return raw.split("\n") if raw else [] temps = query_lines("temperature.gpu") power_draws = query_lines("power.draw") power_limits = query_lines("power.limit") ecc_single = query_lines("ecc.errors.single_bit.total.volatile") ecc_double = query_lines("ecc.errors.double_bit.total.volatile") pcie_gens = query_lines("pcie.link.gen.current") pcie_widths = query_lines("pcie.link.width.current") clock_sms = query_lines("clocks.sm") clock_mems = query_lines("clocks.mem") persistence = query_lines("persistence_mode") throttling_raw = query_lines("clocks_throttle_reasons.active") mig_modes = query_lines("mig.mode.current") temp_warn = self.health_cfg.get("temp_warning", 80) temp_crit = self.health_cfg.get("temp_critical", 90) power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700)) gpu_health = [] overall_pass = True for i in range(gpu_count): checks = {} temp_val = self._safe_int(temps[i] if i < len(temps) else 0) if temp_val >= temp_crit: checks["temperature"] = {"value": temp_val, "status": "FAIL", "threshold": temp_crit} overall_pass = False elif temp_val >= temp_warn: checks["temperature"] = {"value": temp_val, "status": "WARN", "threshold": temp_warn} else: checks["temperature"] = {"value": temp_val, "status": "PASS", "threshold": temp_warn} pd = self._safe_float(power_draws[i] if i < len(power_draws) else 0) pl = self._safe_float(power_limits[i] if i < len(power_limits) else power_lim) checks["power"] = {"value": pd, "limit": pl, "status": "PASS" if pd <= pl * 1.05 else "WARN"} es = self._safe_int(ecc_single[i] if i < len(ecc_single) else 0) ed = self._safe_int(ecc_double[i] if i < len(ecc_double) else 0) ecc_status = "FAIL" if ed > 0 else ("WARN" if es > 100 else "PASS") if ecc_status == "FAIL": overall_pass = False checks["ecc_errors"] = {"single": es, "double": ed, "status": ecc_status} checks["memory_errors"] = {"status": "PASS"} pg = self._safe_int(pcie_gens[i] if i < len(pcie_gens) else 0) pw = self._safe_int(pcie_widths[i] if i < len(pcie_widths) else 0) expected_gen = self.specs.get("pcie_gen", 0) if expected_gen > 0: pcie_ok = pg >= expected_gen and pw >= 16 else: pcie_ok = pw >= 8 # unknown GPU: just check width if not pcie_ok: overall_pass = False checks["pcie_link"] = {"gen": pg, "width": pw, "status": "PASS" if pcie_ok else "WARN"} sm = self._safe_int(clock_sms[i] if i < len(clock_sms) else 0) mm = self._safe_int(clock_mems[i] if i < len(clock_mems) else 0) checks["clock_speed"] = {"sm": sm, "mem": mm, "status": "PASS" if sm > 0 and mm > 0 else "WARN"} throttle_val = throttling_raw[i] if i < len(throttling_raw) else "" # Parse bitmask: 0x0 = none, 0x1 = gpu_idle (benign), others = real throttling throttle_reasons = [] try: bitmask = int(throttle_val, 16) if throttle_val.startswith("0x") else 0 except (ValueError, TypeError): bitmask = 0 # Bit 0 = gpu_idle — not a real problem, ignore it real_throttle = bitmask & ~0x1 if real_throttle: if real_throttle & 0x4: throttle_reasons.append("sw_power_cap") if real_throttle & 0x8: throttle_reasons.append("hw_slowdown") if real_throttle & 0x10: throttle_reasons.append("hw_thermal_slowdown") if real_throttle & 0x20: throttle_reasons.append("hw_power_brake") if real_throttle & 0x40: throttle_reasons.append("sw_thermal_slowdown") if not throttle_reasons: throttle_reasons.append(f"unknown(0x{real_throttle:x})") overall_pass = False checks["throttling"] = {"status": "FAIL" if real_throttle else "PASS", "reasons": throttle_reasons} pers_val = persistence[i] if i < len(persistence) else "" pers_enabled = pers_val == "Enabled" checks["persistence_mode"] = {"enabled": pers_enabled, "status": "PASS" if pers_enabled else "WARN"} worst = "PASS" for chk in checks.values(): s = chk["status"] if s == "FAIL": worst = "FAIL" break elif s == "WARN": worst = "WARN" if worst == "FAIL": overall_pass = False gpu_health.append({"index": i, "status": worst, "checks": checks}) system_health = self._check_system() return { "passed": overall_pass, "gpu_health": gpu_health, "system_health": system_health, "timestamp": datetime.now().isoformat(), "detected_gpu_type": self.gpu_type, } def _check_system(self) -> dict: persistd = shutil.which("nvidia-persistenced") is not None persistd_running = False if persistd: r = self._run_cmd(["pgrep", "-x", "nvidia-persistenced"]) persistd_running = r is not None hugepages = 0 hp_path = "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" if os.path.exists("/proc/meminfo"): r = self._run_cmd(["grep", "-i", "hugepages_total", "/proc/meminfo"]) if r: parts = r.split() hugepages = int(parts[1]) if len(parts) >= 2 else 0 swap_status = False if os.path.exists("/proc/swaps"): r = self._run_cmd(["grep", "-c", "^/", "/proc/swaps"]) if r and int(r) > 0: swap_status = True thp = "unknown" if os.path.exists("/sys/kernel/mm/transparent_hugepage/enabled"): r = self._run_cmd(["cat", "/sys/kernel/mm/transparent_hugepage/enabled"]) if r: if "[always]" in r: thp = "always" elif "[madvise]" in r: thp = "madvise" elif "[never]" in r: thp = "never" fd_soft, fd_max = 1024, 65535 try: import resource fd_soft, fd_max = resource.getrlimit(resource.RLIMIT_NOFILE) except (ImportError, ValueError): pass ib_devs = [] if os.path.isdir("/sys/class/infiniband"): ib_devs = os.listdir("/sys/class/infiniband") rdma_devs = [] if os.path.isdir("/sys/class/infiniband_verbs"): rdma_devs = os.listdir("/sys/class/infiniband_verbs") nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")} return { "nvidia_persistenced": {"installed": persistd, "running": persistd_running}, "hugepages": {"configured": hugepages > 0, "count": hugepages}, "swap": {"enabled": swap_status}, "transparent_hugepage": thp, "file_descriptors": {"soft": fd_soft, "max": fd_max}, "infiniband_devices": ib_devs, "rdma_devices": rdma_devs, "nccl_env_vars": nccl_env, } @staticmethod def print_results(results: dict, console: Console = None): c = console or Console() if "error" in results: c.print(f"[bold red]Error: {results['error']}[/bold red]") return passed = results.get("passed", False) verdict = "[bold green]✓ ALL CHECKS PASSED[/bold green]" if passed else "[bold red]✗ SOME CHECKS FAILED[/bold red]" c.print(Panel(verdict, border_style="green" if passed else "red")) gpu_health = results.get("gpu_health", []) if gpu_health: table = Table(title="GPU Health Checks", box=None, padding=(0, 1)) table.add_column("GPU", style="bold", width=5) table.add_column("Temp", width=10) table.add_column("Power", width=12) table.add_column("ECC", width=10) table.add_column("PCIe", width=10) table.add_column("Clock", width=8) table.add_column("Throttle", width=10) table.add_column("Persist", width=8) table.add_column("Status", width=7) for g in gpu_health: ch = g["checks"] status_color = "green" if g["status"] == "PASS" else ("yellow" if g["status"] == "WARN" else "red") status_text = f"[{status_color}]{g['status']}[/{status_color}]" def status_icon(s): return {"PASS": "[green]✓[/green]", "WARN": "[yellow]![/yellow]", "FAIL": "[red]✗[/red]"}.get(s, s) temp = f"{ch['temperature']['value']}°C {status_icon(ch['temperature']['status'])}" pw = f"{ch['power']['value']:.0f}W {status_icon(ch['power']['status'])}" ecc = f"S:{ch['ecc_errors']['single']} D:{ch['ecc_errors']['double']} {status_icon(ch['ecc_errors']['status'])}" pcie = f"Gen{ch['pcie_link']['gen']}x{ch['pcie_link']['width']} {status_icon(ch['pcie_link']['status'])}" clk = f"{ch['clock_speed']['sm']}MHz {status_icon(ch['clock_speed']['status'])}" thr = status_icon(ch["throttling"]["status"]) pers = status_icon(ch["persistence_mode"]["status"]) table.add_row(str(g["index"]), temp, pw, ecc, pcie, clk, thr, pers, status_text) c.print(table) sys_h = results.get("system_health", {}) if sys_h: c.print("\n[bold cyan]System Health[/bold cyan]") np = sys_h.get("nvidia_persistenced", {}) np_status = "[green]Running[/green]" if np.get("running") else "[red]Not running[/red]" if not np.get("installed"): np_status = "[yellow]Not installed[/yellow]" c.print(f" nvidia-persistenced : {np_status}") hp = sys_h.get("hugepages", {}) hp_status = "[green]Configured[/green]" if hp.get("configured") else "[yellow]Not configured[/yellow]" c.print(f" Hugepages : {hp_status} ({hp.get('count', 0)} pages)") swap = sys_h.get("swap", {}) swap_txt = "[red]Enabled[/red]" if swap.get("enabled") else "[green]Disabled[/green]" c.print(f" Swap : {swap_txt}") thp = sys_h.get("transparent_hugepage", "unknown") thp_color = "green" if thp in ("always", "madvise") else "yellow" c.print(f" Transparent HP : [{thp_color}]{thp}[/{thp_color}]") fd = sys_h.get("file_descriptors", {}) fd_ok = fd.get("soft", 0) >= 65536 fd_color = "green" if fd_ok else "yellow" c.print(f" File Descriptors : [{fd_color}]{fd.get('soft', 'N/A')} (soft) / {fd.get('max', 'N/A')} (max)[/{fd_color}]") ib = sys_h.get("infiniband_devices", []) rdma = sys_h.get("rdma_devices", []) if ib: c.print(f" InfiniBand : [green]{', '.join(ib)}[/green]") else: c.print(" InfiniBand : [yellow]No devices detected[/yellow]") if rdma: c.print(f" RDMA : [green]{', '.join(rdma)}[/green]") nccl = sys_h.get("nccl_env_vars", {}) if nccl: c.print(" NCCL Env Vars:") for k, v in sorted(nccl.items()): c.print(f" {k}={v}") else: c.print(" NCCL Env Vars : [yellow]None set[/yellow]")