test_gpu_scripts/modules/health_check.py
qinyusen b6dff76ef7 add: health check module (temperature, power, ECC, PCIe, system checks)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-25 17:23:44 +08:00

300 lines
13 KiB
Python

"""Hardware health monitoring module for NVIDIA H200."""
import subprocess
import shutil
import os
from datetime import datetime
from typing import Optional
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
class HealthCheck:
    """Hardware health probe for NVIDIA GPUs and the host they run on.

    ``run()`` shells out to ``nvidia-smi`` once per metric, grades every GPU
    as PASS / WARN / FAIL and attaches a snapshot of host settings.
    ``print_results()`` renders that dictionary with rich.

    Optional keys read from ``config["health"]``:
        temp_warning  -- temperature at/above which a GPU is WARN (default 80)
        temp_critical -- temperature at/above which a GPU is FAIL (default 90)
        power_limit   -- limit used when nvidia-smi reports none (default 700)
    """

    # Bit -> name for the hexadecimal mask printed by the
    # ``clocks_throttle_reasons.active`` query (values follow the NVML clocks
    # throttle / event reason constants).
    _THROTTLE_REASONS = {
        0x001: "gpu_idle",
        0x002: "applications_clocks_setting",
        0x004: "sw_power_cap",
        0x008: "hw_slowdown",
        0x010: "sync_boost",
        0x020: "sw_thermal_slowdown",
        0x040: "hw_thermal_slowdown",
        0x080: "hw_power_brake_slowdown",
        0x100: "display_clock_setting",
    }
    # Only hardware / thermal slowdowns count as a health fault. Idle, clock
    # settings, sync boost and the software power cap occur in normal operation.
    _THROTTLE_FAULT_MASK = 0x008 | 0x020 | 0x040 | 0x080
    # Readings that mean "nothing to report".
    # NOTE(review): "Active" is retained from the previously accepted set;
    # confirm whether it should instead count as throttling.
    _NO_THROTTLE_VALUES = ("", "None", "Active", "N/A", "[N/A]", "[Not Supported]")

    def __init__(self, config: dict):
        """Store *config* and cache its ``"health"`` section (may be absent)."""
        self.config = config
        self.console = Console()
        self.health_cfg = config.get("health", {})

    # ------------------------------------------------------------------
    # Low-level helpers
    # ------------------------------------------------------------------
    def _run_cmd(self, cmd: list, timeout: int = 10) -> Optional[str]:
        """Run *cmd* and return its stripped stdout.

        Returns ``None`` when the command exits non-zero, times out, or cannot
        be started at all (missing binary, permission denied, ...).
        """
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        except (subprocess.TimeoutExpired, OSError):
            return None
        return r.stdout.strip() if r.returncode == 0 else None

    def _run_smi(self, query: str) -> Optional[str]:
        """Return the raw output of ``nvidia-smi --query-gpu=<query>``.

        The output is requested as headerless, unit-less CSV.
        Returns ``None`` if nvidia-smi is not on PATH or the query fails.
        """
        if not shutil.which("nvidia-smi"):
            return None
        return self._run_cmd(
            ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
            timeout=30,
        )

    def _safe_int(self, val, default=0):
        """Convert *val* to ``int``; return *default* for N/A markers or junk."""
        try:
            return int(val) if val not in ("N/A", "", "[N/A]") else default
        except (ValueError, TypeError):
            return default

    def _safe_float(self, val, default=0.0):
        """Convert *val* to ``float``; return *default* for N/A markers or junk."""
        try:
            return float(val) if val not in ("N/A", "", "[N/A]") else default
        except (ValueError, TypeError):
            return default

    @staticmethod
    def _read_text(path: str) -> Optional[str]:
        """Return the contents of *path*, or ``None`` if it cannot be read."""
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                return fh.read()
        except OSError:
            return None

    @staticmethod
    def _list_dir(path: str) -> list:
        """Return the sorted entries of *path*, or ``[]`` if it cannot be listed."""
        try:
            return sorted(os.listdir(path))
        except OSError:
            return []

    @classmethod
    def _throttle_faults(cls, raw) -> list:
        """Decode a throttle-reason reading into the fault reasons it contains.

        *raw* is normally a hexadecimal bitmask such as ``0x0000000000000008``.
        Returns the names of the active reasons covered by
        ``_THROTTLE_FAULT_MASK``; benign reasons are ignored. A reading that
        is neither a known "nothing" marker nor a hex number is returned
        verbatim so it is surfaced instead of silently passing.
        """
        text = "" if raw is None else str(raw).strip()
        if text in cls._NO_THROTTLE_VALUES:
            return []
        try:
            mask = int(text, 16)
        except ValueError:
            return [text]
        return [
            name
            for bit, name in cls._THROTTLE_REASONS.items()
            if bit & mask & cls._THROTTLE_FAULT_MASK
        ]

    # ------------------------------------------------------------------
    # GPU evaluation
    # ------------------------------------------------------------------
    def _evaluate_gpu(self, index: int, metrics: dict) -> dict:
        """Grade one GPU from already-collected nvidia-smi readings.

        Args:
            index: GPU index; selects the row in every list of *metrics*.
            metrics: maps a metric name (``temperature``, ``power_draw``,
                ``power_limit``, ``ecc_corrected``, ``ecc_uncorrected``,
                ``pcie_gen``, ``pcie_width``, ``clock_sm``, ``clock_mem``,
                ``throttle``, ``persistence``) to a list of per-GPU strings.
                Missing or short lists are treated as "no reading".

        Returns:
            ``{"index": int, "status": "PASS"|"WARN"|"FAIL", "checks": {...}}``
            where ``status`` is the worst status among the checks.
        """
        cfg = self.health_cfg
        temp_warn = cfg.get("temp_warning", 80)
        temp_crit = cfg.get("temp_critical", 90)
        power_lim = self._safe_float(cfg.get("power_limit", 700), 700.0)

        def reading(key):
            values = metrics.get(key) or []
            return values[index] if index < len(values) else ""

        checks = {}

        temp_val = self._safe_int(reading("temperature"))
        if temp_val >= temp_crit:
            temp_status, temp_threshold = "FAIL", temp_crit
        elif temp_val >= temp_warn:
            temp_status, temp_threshold = "WARN", temp_warn
        else:
            temp_status, temp_threshold = "PASS", temp_warn
        checks["temperature"] = {
            "value": temp_val,
            "status": temp_status,
            "threshold": temp_threshold,
        }

        draw = self._safe_float(reading("power_draw"))
        # Fall back to the configured limit when nvidia-smi reports none;
        # otherwise a "[N/A]" limit would become 0 W and flag any draw.
        limit = self._safe_float(reading("power_limit"), power_lim)
        checks["power"] = {
            "value": draw,
            "limit": limit,
            "status": "PASS" if draw <= limit * 1.05 else "WARN",
        }

        ecc_single = self._safe_int(reading("ecc_corrected"))
        ecc_double = self._safe_int(reading("ecc_uncorrected"))
        if ecc_double > 0:
            ecc_status = "FAIL"
        elif ecc_single > 100:
            ecc_status = "WARN"
        else:
            ecc_status = "PASS"
        checks["ecc_errors"] = {
            "single": ecc_single,
            "double": ecc_double,
            "status": ecc_status,
        }

        # Placeholder: nothing is measured here yet, the entry only keeps the
        # result schema stable.
        checks["memory_errors"] = {"status": "PASS"}

        pcie_gen = self._safe_int(reading("pcie_gen"))
        pcie_width = self._safe_int(reading("pcie_width"))
        pcie_ok = pcie_gen >= 4 and pcie_width >= 8
        # A degraded link fails the overall run, so it is reported as FAIL to
        # keep the per-GPU table consistent with the verdict.
        checks["pcie_link"] = {
            "gen": pcie_gen,
            "width": pcie_width,
            "status": "PASS" if pcie_ok else "FAIL",
        }

        clock_sm = self._safe_int(reading("clock_sm"))
        clock_mem = self._safe_int(reading("clock_mem"))
        checks["clock_speed"] = {
            "sm": clock_sm,
            "mem": clock_mem,
            "status": "PASS" if clock_sm > 0 and clock_mem > 0 else "WARN",
        }

        faults = self._throttle_faults(reading("throttle"))
        checks["throttling"] = {
            "status": "FAIL" if faults else "PASS",
            "reasons": faults,
        }

        pers_enabled = reading("persistence") == "Enabled"
        checks["persistence_mode"] = {
            "enabled": pers_enabled,
            "status": "PASS" if pers_enabled else "WARN",
        }

        statuses = {chk["status"] for chk in checks.values()}
        if "FAIL" in statuses:
            worst = "FAIL"
        elif "WARN" in statuses:
            worst = "WARN"
        else:
            worst = "PASS"
        return {"index": index, "status": worst, "checks": checks}

    def run(self) -> dict:
        """Query nvidia-smi, grade every GPU and collect host health data.

        Returns:
            On failure to query: ``{"error": str, "passed": False}``.
            Otherwise a dict with ``passed`` (False if any GPU is FAIL),
            ``gpu_health`` (one entry per GPU, see ``_evaluate_gpu``),
            ``system_health`` (see ``_check_system``) and an ISO ``timestamp``.
        """
        if not shutil.which("nvidia-smi"):
            return {"error": "nvidia-smi not found", "passed": False}

        gpu_count_str = self._run_smi("count")
        if not gpu_count_str:
            return {"error": "nvidia-smi query failed", "passed": False}
        # "count" is printed once per GPU; the first line is enough.
        gpu_count = self._safe_int(gpu_count_str.split("\n")[0].strip(), default=-1)
        if gpu_count < 0:
            return {"error": "nvidia-smi query failed", "passed": False}

        def query_lines(*fields):
            # Try each field name in turn; the first that yields output wins.
            for field in fields:
                raw = self._run_smi(field)
                if raw:
                    return [line.strip() for line in raw.split("\n")]
            return []

        # One query per metric so that a single unsupported field cannot
        # take the others down with it.
        metrics = {
            "temperature": query_lines("temperature.gpu"),
            "power_draw": query_lines("power.draw"),
            "power_limit": query_lines("power.limit"),
            "ecc_corrected": query_lines("ecc.errors.corrected.volatile.total"),
            "ecc_uncorrected": query_lines("ecc.errors.uncorrected.volatile.total"),
            "pcie_gen": query_lines("pcie.link.gen.current"),
            "pcie_width": query_lines("pcie.link.width.current"),
            "clock_sm": query_lines("clocks.sm"),
            "clock_mem": query_lines("clocks.mem"),
            "persistence": query_lines("persistence_mode"),
            # Newer drivers expose the same mask as clocks_event_reasons.active.
            "throttle": query_lines(
                "clocks_throttle_reasons.active", "clocks_event_reasons.active"
            ),
        }

        gpu_health = [self._evaluate_gpu(i, metrics) for i in range(gpu_count)]
        overall_pass = all(g["status"] != "FAIL" for g in gpu_health)

        return {
            "passed": overall_pass,
            "gpu_health": gpu_health,
            "system_health": self._check_system(),
            "timestamp": datetime.now().isoformat(),
        }

    # ------------------------------------------------------------------
    # Host evaluation
    # ------------------------------------------------------------------
    def _check_system(self) -> dict:
        """Collect host-level settings relevant to GPU workloads.

        Everything is best-effort: a file or tool that is unavailable leaves
        the corresponding entry at its default instead of raising.
        """
        persistd = shutil.which("nvidia-persistenced") is not None
        persistd_running = False
        if persistd:
            persistd_running = (
                self._run_cmd(["pgrep", "-x", "nvidia-persistenced"]) is not None
            )

        hugepages = 0
        meminfo = self._read_text("/proc/meminfo") or ""
        for line in meminfo.splitlines():
            if "hugepages_total" in line.lower():
                parts = line.split()
                hugepages = self._safe_int(parts[1]) if len(parts) >= 2 else 0
                break

        # Active swap areas are the lines of /proc/swaps that start with a path.
        swaps = self._read_text("/proc/swaps") or ""
        swap_status = any(line.startswith("/") for line in swaps.splitlines())

        # The active THP mode is the bracketed word in the sysfs file.
        thp = "unknown"
        thp_raw = self._read_text("/sys/kernel/mm/transparent_hugepage/enabled") or ""
        for mode in ("always", "madvise", "never"):
            if f"[{mode}]" in thp_raw:
                thp = mode
                break

        fd_soft, fd_max = 1024, 65535
        try:
            import resource  # not available on every platform

            fd_soft, fd_max = resource.getrlimit(resource.RLIMIT_NOFILE)
        except (ImportError, ValueError, OSError):
            pass

        ib_devs = self._list_dir("/sys/class/infiniband")
        rdma_devs = self._list_dir("/sys/class/infiniband_verbs")
        nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}

        return {
            "nvidia_persistenced": {"installed": persistd, "running": persistd_running},
            "hugepages": {"configured": hugepages > 0, "count": hugepages},
            "swap": {"enabled": swap_status},
            "transparent_hugepage": thp,
            "file_descriptors": {"soft": fd_soft, "max": fd_max},
            "infiniband_devices": ib_devs,
            "rdma_devices": rdma_devs,
            "nccl_env_vars": nccl_env,
        }

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------
    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Render the dictionary returned by ``run()``.

        Args:
            results: output of ``run()``; an ``"error"`` key short-circuits
                to a single error line.
            console: rich console to print to; a new one is created if omitted.
        """
        c = console or Console()
        if "error" in results:
            c.print(f"[bold red]Error: {results['error']}[/bold red]")
            return

        passed = results.get("passed", False)
        if passed:
            verdict = "[bold green]✓ ALL CHECKS PASSED[/bold green]"
        else:
            verdict = "[bold red]✗ SOME CHECKS FAILED[/bold red]"
        c.print(Panel(verdict, border_style="green" if passed else "red"))

        icons = {
            "PASS": "[green]✓[/green]",
            "WARN": "[yellow]![/yellow]",
            "FAIL": "[red]✗[/red]",
        }

        def status_icon(s):
            return icons.get(s, s)

        gpu_health = results.get("gpu_health", [])
        if gpu_health:
            table = Table(title="GPU Health Checks", box=None, padding=(0, 1))
            table.add_column("GPU", style="bold", width=5)
            table.add_column("Temp", width=10)
            table.add_column("Power", width=12)
            table.add_column("ECC", width=10)
            table.add_column("PCIe", width=10)
            table.add_column("Clock", width=8)
            table.add_column("Throttle", width=10)
            table.add_column("Persist", width=8)
            table.add_column("Status", width=7)
            for g in gpu_health:
                ch = g["checks"]
                status_color = {"PASS": "green", "WARN": "yellow"}.get(g["status"], "red")
                status_text = f"[{status_color}]{g['status']}[/{status_color}]"
                temp = f"{ch['temperature']['value']}°C {status_icon(ch['temperature']['status'])}"
                pw = f"{ch['power']['value']:.0f}W {status_icon(ch['power']['status'])}"
                ecc = (
                    f"S:{ch['ecc_errors']['single']} D:{ch['ecc_errors']['double']} "
                    f"{status_icon(ch['ecc_errors']['status'])}"
                )
                pcie = (
                    f"Gen{ch['pcie_link']['gen']}x{ch['pcie_link']['width']} "
                    f"{status_icon(ch['pcie_link']['status'])}"
                )
                clk = f"{ch['clock_speed']['sm']}MHz {status_icon(ch['clock_speed']['status'])}"
                thr = status_icon(ch["throttling"]["status"])
                pers = status_icon(ch["persistence_mode"]["status"])
                table.add_row(str(g["index"]), temp, pw, ecc, pcie, clk, thr, pers, status_text)
            c.print(table)

        sys_h = results.get("system_health", {})
        if not sys_h:
            return

        c.print("\n[bold cyan]System Health[/bold cyan]")

        persistd = sys_h.get("nvidia_persistenced", {})
        if not persistd.get("installed"):
            persistd_status = "[yellow]Not installed[/yellow]"
        elif persistd.get("running"):
            persistd_status = "[green]Running[/green]"
        else:
            persistd_status = "[red]Not running[/red]"
        c.print(f" nvidia-persistenced : {persistd_status}")

        hp = sys_h.get("hugepages", {})
        hp_status = (
            "[green]Configured[/green]"
            if hp.get("configured")
            else "[yellow]Not configured[/yellow]"
        )
        c.print(f" Hugepages : {hp_status} ({hp.get('count', 0)} pages)")

        swap = sys_h.get("swap", {})
        swap_txt = "[red]Enabled[/red]" if swap.get("enabled") else "[green]Disabled[/green]"
        c.print(f" Swap : {swap_txt}")

        thp = sys_h.get("transparent_hugepage", "unknown")
        thp_color = "green" if thp in ("always", "madvise") else "yellow"
        c.print(f" Transparent HP : [{thp_color}]{thp}[/{thp_color}]")

        fd = sys_h.get("file_descriptors", {})
        fd_color = "green" if fd.get("soft", 0) >= 65536 else "yellow"
        c.print(
            f" File Descriptors : [{fd_color}]{fd.get('soft', 'N/A')} (soft) / "
            f"{fd.get('max', 'N/A')} (max)[/{fd_color}]"
        )

        ib = sys_h.get("infiniband_devices", [])
        rdma = sys_h.get("rdma_devices", [])
        if ib:
            c.print(f" InfiniBand : [green]{', '.join(ib)}[/green]")
        else:
            c.print(" InfiniBand : [yellow]No devices detected[/yellow]")
        if rdma:
            c.print(f" RDMA : [green]{', '.join(rdma)}[/green]")

        nccl = sys_h.get("nccl_env_vars", {})
        if nccl:
            c.print(" NCCL Env Vars:")
            for k, v in sorted(nccl.items()):
                # Environment values are arbitrary text; do not let stray
                # square brackets be parsed as rich markup.
                c.print(f" {k}={v}", markup=False)
        else:
            c.print(" NCCL Env Vars : [yellow]None set[/yellow]")