test_gpu_scripts/modules/health_check.py
hongshuai.dong 1db1313d50 fix(health): mark PCIe link downgrade as FAIL instead of WARN
The PCIe link health check was producing inconsistent verdicts: when the
negotiated link did not meet the GPU's expected Gen/Width (e.g. an H200
running at Gen4 instead of Gen5, or any GPU dropping below x16), the code
correctly flipped overall_pass to False — but recorded the per-GPU status
as "WARN" rather than "FAIL".

This mismatch broke the convention used by every other check in the
module (temperature, ECC, throttling), where FAIL is the only status
that drives overall_pass=False, and WARN is purely informational. As a
result the rendered Markdown / table output would show a yellow WARN
badge for the affected GPU while the overall Health Check verdict came
back red FAIL, leaving operators to wonder which signal to trust.

A PCIe link downgrade is not a soft warning — it halves H2D/D2H
bandwidth (Gen5 x16 ~64 GB/s -> Gen4 x16 ~32 GB/s), directly impacting
data loading, checkpoint I/O, and ZeRO/offload throughput. For an
acceptance-test tool this should be a hard failure, consistent with how
overall_pass already treats it.

Change: in modules/health_check.py, set status to "FAIL" (not "WARN")
when pcie_ok is False. This applies to both the known-GPU path
(Gen >= expected and Width >= 16) and the unknown-GPU fallback path
(Width >= 8). No behavioral change to overall_pass — only the per-GPU
status string is corrected so the table view, Markdown report, and the
overall verdict now agree.
2026-05-10 17:23:51 +08:00

328 lines
14 KiB
Python

"""Hardware health monitoring module for NVIDIA datacenter GPUs."""
import subprocess
import shutil
import os
from datetime import datetime
from typing import Optional
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from modules.gpu_specs import detect_gpu_type, get_gpu_specs
class HealthCheck:
    """Collect per-GPU and host-level health signals through ``nvidia-smi``.

    ``run()`` returns a plain dict holding an overall verdict, one entry per
    GPU and a snapshot of host settings; ``print_results()`` renders that
    dict with rich. A per-check status of ``"FAIL"`` is the only status that
    makes the overall verdict fail; ``"WARN"`` is informational.
    """

    # Strings treated as "no value" when parsing nvidia-smi output.
    _MISSING_VALUES = ("N/A", "", "[N/A]")

    # Bit 0 of the throttle bitmask means "GPU idle" and is not a fault.
    _THROTTLE_IDLE_BIT = 0x1

    # Remaining throttle bits, labelled after the NVML
    # nvmlClocksThrottleReason* constants.
    _THROTTLE_BITS = (
        (0x2, "applications_clocks_setting"),
        (0x4, "sw_power_cap"),
        (0x8, "hw_slowdown"),
        (0x10, "sync_boost"),
        (0x20, "sw_thermal_slowdown"),
        (0x40, "hw_thermal_slowdown"),
        (0x80, "hw_power_brake"),
        (0x100, "display_clock_setting"),
    )

    _PERSISTENCED_NAME = "nvidia-persistenced"
    # Linux keeps only the first 15 characters of a process name, and
    # ``pgrep -x`` matches against that truncated name.
    _PERSISTENCED_COMM = _PERSISTENCED_NAME[:15]

    def __init__(self, config: dict):
        """Store *config* and detect the GPU model and its expected specs.

        Args:
            config: Tool configuration; the optional ``"health"`` sub-dict
                supplies ``temp_warning``, ``temp_critical`` and
                ``power_limit`` overrides.
        """
        self.config = config
        self.console = Console()
        self.health_cfg = config.get("health", {})
        self.gpu_type = detect_gpu_type()
        self.specs = get_gpu_specs(self.gpu_type)

    # ------------------------------------------------------------------
    # Low-level helpers
    # ------------------------------------------------------------------

    def _run_cmd(self, cmd: list, timeout: int = 10) -> Optional[str]:
        """Run *cmd* and return its stripped stdout, or ``None`` on failure.

        Failure means a non-zero exit status, a timeout, or the command not
        being executable at all (missing binary, permission denied, ...).
        """
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        except (subprocess.TimeoutExpired, OSError):
            return None
        return proc.stdout.strip() if proc.returncode == 0 else None

    def _run_smi(self, query: str) -> Optional[str]:
        """Query one ``--query-gpu`` field; return raw CSV text or ``None``."""
        if not shutil.which("nvidia-smi"):
            return None
        return self._run_cmd(
            ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
            timeout=30,
        )

    def _query_lines(self, *fields: str) -> list:
        """Return per-GPU lines for the first of *fields* that yields output.

        Several spellings may be passed for the same metric; later ones are
        only tried when the earlier query fails or prints nothing. Returns an
        empty list when none of them works.
        """
        for field in fields:
            raw = self._run_smi(field)
            if raw:
                return [line.strip() for line in raw.splitlines()]
        return []

    def _safe_int(self, val, default=0):
        """Convert *val* to ``int``; return *default* if missing or invalid."""
        if isinstance(val, str):
            val = val.strip()
        if val in self._MISSING_VALUES:
            return default
        try:
            return int(val)
        except (ValueError, TypeError):
            return default

    def _safe_float(self, val, default=0.0):
        """Convert *val* to ``float``; return *default* if missing or invalid."""
        if isinstance(val, str):
            val = val.strip()
        if val in self._MISSING_VALUES:
            return default
        try:
            return float(val)
        except (ValueError, TypeError):
            return default

    @staticmethod
    def _field(lines: list, index: int, fallback=""):
        """Return ``lines[index]``, or *fallback* when the list is too short."""
        return lines[index] if index < len(lines) else fallback

    @staticmethod
    def _read_text(path: str) -> Optional[str]:
        """Return the contents of *path*, or ``None`` if it cannot be read."""
        try:
            with open(path, encoding="utf-8", errors="replace") as handle:
                return handle.read()
        except OSError:
            return None

    @classmethod
    def _decode_throttle(cls, raw) -> tuple:
        """Decode a throttle-reasons bitmask string.

        Args:
            raw: Hex string such as ``"0x0000000000000004"``. Anything that
                does not start with ``0x`` or fails to parse counts as 0.

        Returns:
            ``(active, reasons)`` where *active* is the bitmask with the
            idle bit cleared and *reasons* lists a label for every set bit.
            Bits without a known label are reported together as
            ``unknown(0x...)`` so that no set bit is silently dropped.
        """
        text = raw.strip() if isinstance(raw, str) else ""
        try:
            bitmask = int(text, 16) if text.startswith("0x") else 0
        except ValueError:
            bitmask = 0
        active = bitmask & ~cls._THROTTLE_IDLE_BIT
        reasons = []
        unlabelled = active
        for bit, label in cls._THROTTLE_BITS:
            if active & bit:
                reasons.append(label)
                unlabelled &= ~bit
        if unlabelled:
            reasons.append(f"unknown(0x{unlabelled:x})")
        return active, reasons

    @staticmethod
    def _worst_status(checks: dict) -> str:
        """Return the most severe status in *checks* (FAIL > WARN > PASS)."""
        statuses = {chk["status"] for chk in checks.values()}
        if "FAIL" in statuses:
            return "FAIL"
        if "WARN" in statuses:
            return "WARN"
        return "PASS"

    # ------------------------------------------------------------------
    # GPU checks
    # ------------------------------------------------------------------

    def _evaluate_gpu(self, index: int, readings: dict, temp_warn, temp_crit, power_lim) -> dict:
        """Build the per-check result dict for GPU *index*.

        Args:
            index: Zero-based GPU index into every list of *readings*.
            readings: Maps a metric name to the per-GPU lines printed by
                nvidia-smi; missing or short lists fall back to defaults.
            temp_warn: Temperature (deg C) at which the status becomes WARN.
            temp_crit: Temperature (deg C) at which the status becomes FAIL.
            power_lim: Power limit (W) used when the GPU does not report one.

        Returns:
            Dict keyed by check name; every value carries a ``"status"`` of
            ``"PASS"``, ``"WARN"`` or ``"FAIL"``.
        """

        def reading(name):
            return self._field(readings.get(name, []), index)

        checks = {}

        temp_val = self._safe_int(reading("temps"))
        if temp_val >= temp_crit:
            temp_status, temp_threshold = "FAIL", temp_crit
        elif temp_val >= temp_warn:
            temp_status, temp_threshold = "WARN", temp_warn
        else:
            temp_status, temp_threshold = "PASS", temp_warn
        checks["temperature"] = {"value": temp_val, "status": temp_status, "threshold": temp_threshold}

        draw_w = self._safe_float(reading("power_draws"))
        # A limit reported as N/A must not be read as 0 W, which would flag
        # any positive draw; use the configured/spec limit instead.
        limit_w = self._safe_float(reading("power_limits"), self._safe_float(power_lim))
        checks["power"] = {
            "value": draw_w,
            "limit": limit_w,
            "status": "PASS" if draw_w <= limit_w * 1.05 else "WARN",
        }

        ecc_single = self._safe_int(reading("ecc_single"))
        ecc_double = self._safe_int(reading("ecc_double"))
        if ecc_double > 0:
            ecc_status = "FAIL"
        elif ecc_single > 100:
            ecc_status = "WARN"
        else:
            ecc_status = "PASS"
        checks["ecc_errors"] = {"single": ecc_single, "double": ecc_double, "status": ecc_status}
        # Placeholder: no dedicated memory-error probe is implemented.
        checks["memory_errors"] = {"status": "PASS"}

        link_gen = self._safe_int(reading("pcie_gens"))
        link_width = self._safe_int(reading("pcie_widths"))
        expected_gen = self.specs.get("pcie_gen", 0) or 0
        # NOTE(review): some drivers lower the *current* link generation
        # while the GPU is idle -- confirm this check runs under load.
        if expected_gen > 0:
            pcie_ok = link_gen >= expected_gen and link_width >= 16
        else:
            pcie_ok = link_width >= 8  # unknown GPU: just check width
        checks["pcie_link"] = {"gen": link_gen, "width": link_width, "status": "PASS" if pcie_ok else "FAIL"}

        sm_clock = self._safe_int(reading("clock_sms"))
        mem_clock = self._safe_int(reading("clock_mems"))
        checks["clock_speed"] = {
            "sm": sm_clock,
            "mem": mem_clock,
            "status": "PASS" if sm_clock > 0 and mem_clock > 0 else "WARN",
        }

        active_throttle, throttle_reasons = self._decode_throttle(reading("throttling"))
        checks["throttling"] = {"status": "FAIL" if active_throttle else "PASS", "reasons": throttle_reasons}

        pers_enabled = str(reading("persistence")).strip() == "Enabled"
        checks["persistence_mode"] = {"enabled": pers_enabled, "status": "PASS" if pers_enabled else "WARN"}

        return checks

    def run(self) -> dict:
        """Run all GPU and system checks.

        Returns:
            On success a dict with ``passed``, ``gpu_health``,
            ``system_health``, ``timestamp`` and ``detected_gpu_type``.
            When nvidia-smi is missing or the GPU count cannot be obtained,
            a dict with ``error`` and ``passed`` set to ``False``.
        """
        if not shutil.which("nvidia-smi"):
            return {"error": "nvidia-smi not found", "passed": False}
        gpu_count_str = self._run_smi("count")
        if not gpu_count_str:
            return {"error": "nvidia-smi query failed", "passed": False}
        try:
            gpu_count = int(gpu_count_str.splitlines()[0])
        except (ValueError, IndexError):
            return {"error": "nvidia-smi query failed", "passed": False}

        readings = {
            "temps": self._query_lines("temperature.gpu"),
            "power_draws": self._query_lines("power.draw"),
            "power_limits": self._query_lines("power.limit"),
            # Prefer the field names listed by ``nvidia-smi --help-query-gpu``
            # and fall back to the spelling this module used previously.
            "ecc_single": self._query_lines(
                "ecc.errors.corrected.volatile.total",
                "ecc.errors.single_bit.total.volatile",
            ),
            "ecc_double": self._query_lines(
                "ecc.errors.uncorrected.volatile.total",
                "ecc.errors.double_bit.total.volatile",
            ),
            "pcie_gens": self._query_lines("pcie.link.gen.current"),
            "pcie_widths": self._query_lines("pcie.link.width.current"),
            "clock_sms": self._query_lines("clocks.sm"),
            "clock_mems": self._query_lines("clocks.mem"),
            "persistence": self._query_lines("persistence_mode"),
            "throttling": self._query_lines("clocks_throttle_reasons.active"),
        }

        temp_warn = self.health_cfg.get("temp_warning", 80)
        temp_crit = self.health_cfg.get("temp_critical", 90)
        power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700))

        gpu_health = []
        for i in range(gpu_count):
            checks = self._evaluate_gpu(i, readings, temp_warn, temp_crit, power_lim)
            gpu_health.append({"index": i, "status": self._worst_status(checks), "checks": checks})

        # The run passes only if no GPU has a FAIL check.
        overall_pass = all(gpu["status"] != "FAIL" for gpu in gpu_health)

        return {
            "passed": overall_pass,
            "gpu_health": gpu_health,
            "system_health": self._check_system(),
            "timestamp": datetime.now().isoformat(),
            "detected_gpu_type": self.gpu_type,
        }

    # ------------------------------------------------------------------
    # Host checks
    # ------------------------------------------------------------------

    def _check_system(self) -> dict:
        """Snapshot host settings that are reported alongside the GPU checks.

        Returns:
            Dict with nvidia-persistenced state, hugepage count, swap state,
            transparent-hugepage mode, file-descriptor limits, InfiniBand and
            RDMA device names, and all ``NCCL_*`` environment variables.
        """
        persistd = shutil.which("nvidia-persistenced") is not None
        persistd_running = False
        if persistd:
            # Try the full name first, then the 15-character truncated form
            # that ``pgrep -x`` compares against.
            persistd_running = any(
                self._run_cmd(["pgrep", "-x", name]) is not None
                for name in (self._PERSISTENCED_NAME, self._PERSISTENCED_COMM)
            )

        hugepages = 0
        meminfo = self._read_text("/proc/meminfo") or ""
        for line in meminfo.splitlines():
            if "hugepages_total" in line.lower():
                parts = line.split()
                if len(parts) >= 2:
                    hugepages = self._safe_int(parts[1])
                break

        # Active swap areas are the lines of /proc/swaps that start with "/".
        swaps = self._read_text("/proc/swaps") or ""
        swap_status = any(line.startswith("/") for line in swaps.splitlines())

        thp = "unknown"
        thp_raw = self._read_text("/sys/kernel/mm/transparent_hugepage/enabled") or ""
        for mode in ("always", "madvise", "never"):
            # The active mode is the one shown in square brackets.
            if f"[{mode}]" in thp_raw:
                thp = mode
                break

        fd_soft, fd_max = 1024, 65535
        try:
            import resource

            fd_soft, fd_max = resource.getrlimit(resource.RLIMIT_NOFILE)
        except (ImportError, ValueError, OSError):
            pass  # keep the defaults above when the limit cannot be queried

        ib_devs = []
        if os.path.isdir("/sys/class/infiniband"):
            ib_devs = os.listdir("/sys/class/infiniband")
        rdma_devs = []
        if os.path.isdir("/sys/class/infiniband_verbs"):
            rdma_devs = os.listdir("/sys/class/infiniband_verbs")

        nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}

        return {
            "nvidia_persistenced": {"installed": persistd, "running": persistd_running},
            "hugepages": {"configured": hugepages > 0, "count": hugepages},
            "swap": {"enabled": swap_status},
            "transparent_hugepage": thp,
            "file_descriptors": {"soft": fd_soft, "max": fd_max},
            "infiniband_devices": ib_devs,
            "rdma_devices": rdma_devs,
            "nccl_env_vars": nccl_env,
        }

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------

    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Render the dict returned by ``run()``.

        Args:
            results: Output of ``run()``; if it contains ``"error"`` only
                that message is printed.
            console: Console to print to; a new one is created when omitted.
        """
        c = console or Console()
        if "error" in results:
            c.print(f"[bold red]Error: {results['error']}[/bold red]")
            return

        passed = results.get("passed", False)
        if passed:
            verdict = "[bold green]✓ ALL CHECKS PASSED[/bold green]"
        else:
            verdict = "[bold red]✗ SOME CHECKS FAILED[/bold red]"
        c.print(Panel(verdict, border_style="green" if passed else "red"))

        icons = {"PASS": "[green]✓[/green]", "WARN": "[yellow]![/yellow]", "FAIL": "[red]✗[/red]"}

        def status_icon(s):
            # Unknown statuses are shown verbatim.
            return icons.get(s, s)

        gpu_health = results.get("gpu_health", [])
        if gpu_health:
            table = Table(title="GPU Health Checks", box=None, padding=(0, 1))
            table.add_column("GPU", style="bold", width=5)
            table.add_column("Temp", width=10)
            table.add_column("Power", width=12)
            table.add_column("ECC", width=10)
            table.add_column("PCIe", width=10)
            table.add_column("Clock", width=8)
            table.add_column("Throttle", width=10)
            table.add_column("Persist", width=8)
            table.add_column("Status", width=7)
            for g in gpu_health:
                ch = g["checks"]
                if g["status"] == "PASS":
                    status_color = "green"
                elif g["status"] == "WARN":
                    status_color = "yellow"
                else:
                    status_color = "red"
                status_text = f"[{status_color}]{g['status']}[/{status_color}]"
                temp = f"{ch['temperature']['value']}°C {status_icon(ch['temperature']['status'])}"
                power = f"{ch['power']['value']:.0f}W {status_icon(ch['power']['status'])}"
                ecc = f"S:{ch['ecc_errors']['single']} D:{ch['ecc_errors']['double']} {status_icon(ch['ecc_errors']['status'])}"
                pcie = f"Gen{ch['pcie_link']['gen']}x{ch['pcie_link']['width']} {status_icon(ch['pcie_link']['status'])}"
                clk = f"{ch['clock_speed']['sm']}MHz {status_icon(ch['clock_speed']['status'])}"
                thr = status_icon(ch["throttling"]["status"])
                pers = status_icon(ch["persistence_mode"]["status"])
                table.add_row(str(g["index"]), temp, power, ecc, pcie, clk, thr, pers, status_text)
            c.print(table)

        sys_h = results.get("system_health", {})
        if sys_h:
            c.print("\n[bold cyan]System Health[/bold cyan]")

            persistd = sys_h.get("nvidia_persistenced", {})
            if not persistd.get("installed"):
                persistd_status = "[yellow]Not installed[/yellow]"
            elif persistd.get("running"):
                persistd_status = "[green]Running[/green]"
            else:
                persistd_status = "[red]Not running[/red]"
            c.print(f" nvidia-persistenced : {persistd_status}")

            hp = sys_h.get("hugepages", {})
            hp_status = "[green]Configured[/green]" if hp.get("configured") else "[yellow]Not configured[/yellow]"
            c.print(f" Hugepages : {hp_status} ({hp.get('count', 0)} pages)")

            swap = sys_h.get("swap", {})
            swap_txt = "[red]Enabled[/red]" if swap.get("enabled") else "[green]Disabled[/green]"
            c.print(f" Swap : {swap_txt}")

            thp = sys_h.get("transparent_hugepage", "unknown")
            thp_color = "green" if thp in ("always", "madvise") else "yellow"
            c.print(f" Transparent HP : [{thp_color}]{thp}[/{thp_color}]")

            fd = sys_h.get("file_descriptors", {})
            fd_color = "green" if fd.get("soft", 0) >= 65536 else "yellow"
            c.print(f" File Descriptors : [{fd_color}]{fd.get('soft', 'N/A')} (soft) / {fd.get('max', 'N/A')} (max)[/{fd_color}]")

            ib = sys_h.get("infiniband_devices", [])
            rdma = sys_h.get("rdma_devices", [])
            if ib:
                c.print(f" InfiniBand : [green]{', '.join(ib)}[/green]")
            else:
                c.print(" InfiniBand : [yellow]No devices detected[/yellow]")
            if rdma:
                c.print(f" RDMA : [green]{', '.join(rdma)}[/green]")

            nccl = sys_h.get("nccl_env_vars", {})
            if nccl:
                c.print(" NCCL Env Vars:")
                for k, v in sorted(nccl.items()):
                    c.print(f" {k}={v}")
            else:
                c.print(" NCCL Env Vars : [yellow]None set[/yellow]")