The PCIe link health check was producing inconsistent verdicts: when the negotiated link did not meet the GPU's expected Gen/Width (e.g. an H200 running at Gen4 instead of Gen5, or any GPU dropping below x16), the code correctly flipped overall_pass to False — but recorded the per-GPU status as "WARN" rather than "FAIL". This mismatch broke the convention used by every other check in the module (temperature, ECC, throttling), where FAIL is the only status that drives overall_pass=False, and WARN is purely informational. As a result the rendered Markdown / table output would show a yellow WARN badge for the affected GPU while the overall Health Check verdict came back red FAIL, leaving operators to wonder which signal to trust. A PCIe link downgrade is not a soft warning — it halves H2D/D2H bandwidth (Gen5 x16 ~64 GB/s -> Gen4 x16 ~32 GB/s), directly impacting data loading, checkpoint I/O, and ZeRO/offload throughput. For an acceptance-test tool this should be a hard failure, consistent with how overall_pass already treats it. Change: in modules/health_check.py, set status to "FAIL" (not "WARN") when pcie_ok is False. This applies to both the known-GPU path (Gen >= expected and Width >= 16) and the unknown-GPU fallback path (Width >= 8). No behavioral change to overall_pass — only the per-GPU status string is corrected so the table view, Markdown report, and the overall verdict now agree.
328 lines
14 KiB
Python
328 lines
14 KiB
Python
"""Hardware health monitoring module for NVIDIA datacenter GPUs."""
|
|
|
|
import os
import shutil
import subprocess
from datetime import datetime
from typing import Optional

from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.table import Table
from rich.text import Text

from modules.gpu_specs import detect_gpu_type, get_gpu_specs
|
|
|
|
|
|
class HealthCheck:
    """Collect GPU and host health signals and render them as a report.

    GPU data comes from ``nvidia-smi --query-gpu`` (one query per field);
    host data comes from ``/proc``, ``/sys``, ``pgrep`` and the process
    environment.  Every per-GPU check carries a ``status`` of ``"PASS"``,
    ``"WARN"`` or ``"FAIL"``.  ``"FAIL"`` is the only status that makes the
    overall verdict fail; ``"WARN"`` is informational.
    """

    # Cell values treated as "no data" when parsing numbers.
    _MISSING_VALUES = ("N/A", "", "[N/A]")

    # Metric name -> nvidia-smi --query-gpu field, queried in this order.
    # A query that fails yields an empty column, and the affected check then
    # falls back to its default value.
    # NOTE(review): confirm the two ECC field names against
    # `nvidia-smi --help-query-gpu`; if the driver rejects them, both counters
    # silently default to 0 and the ECC check always passes.
    _GPU_QUERY_FIELDS = {
        "temperature": "temperature.gpu",
        "power_draw": "power.draw",
        "power_limit": "power.limit",
        "ecc_single": "ecc.errors.single_bit.total.volatile",
        "ecc_double": "ecc.errors.double_bit.total.volatile",
        "pcie_gen": "pcie.link.gen.current",
        "pcie_width": "pcie.link.width.current",
        "clock_sm": "clocks.sm",
        "clock_mem": "clocks.mem",
        "persistence": "persistence_mode",
        "throttle": "clocks_throttle_reasons.active",
    }

    # Bit 0 of the throttle bitmask means "GPU idle" and is not a problem.
    _THROTTLE_IDLE_BIT = 0x1

    # Remaining throttle bits, following NVML's nvmlClocksThrottleReason*
    # constants.  Any of these being set is reported as a FAIL.
    _THROTTLE_REASONS = (
        (0x2, "applications_clocks_setting"),
        (0x4, "sw_power_cap"),
        (0x8, "hw_slowdown"),
        (0x10, "sync_boost"),
        (0x20, "sw_thermal_slowdown"),
        (0x40, "hw_thermal_slowdown"),
        (0x80, "hw_power_brake"),
        (0x100, "display_clock_setting"),
    )

    def __init__(self, config: dict):
        """Store *config* and resolve the GPU model and its expected specs.

        Args:
            config: Tool configuration.  Its optional ``"health"`` mapping
                supplies the ``temp_warning``, ``temp_critical`` and
                ``power_limit`` overrides read by the checks.
        """
        self.config = config
        self.console = Console()
        self.health_cfg = config.get("health", {})
        self.gpu_type = detect_gpu_type()
        self.specs = get_gpu_specs(self.gpu_type)

    def _run_smi(self, query: str) -> Optional[str]:
        """Return the stripped output of one ``--query-gpu`` field, or None.

        None is returned when ``nvidia-smi`` is not on PATH, exits non-zero,
        times out (30 s) or cannot be executed.
        """
        if not shutil.which("nvidia-smi"):
            return None
        return self._run_cmd(
            ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
            timeout=30,
        )

    def _run_cmd(self, cmd: list, timeout: int = 10) -> Optional[str]:
        """Run *cmd* and return its stripped stdout, or None on any failure.

        Failure means a non-zero exit code, a timeout, or an OS-level error
        while launching the command (missing binary, permission denied, ...).
        """
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        except (subprocess.TimeoutExpired, OSError):
            return None
        return proc.stdout.strip() if proc.returncode == 0 else None

    def _query_lines(self, field: str) -> list:
        """Return one stripped output line per GPU for *field* ([] on failure)."""
        raw = self._run_smi(field)
        return [line.strip() for line in raw.splitlines()] if raw else []

    def _safe_int(self, val, default=0):
        """Convert *val* to int; return *default* for missing or invalid data."""
        try:
            return int(val) if val not in self._MISSING_VALUES else default
        except (ValueError, TypeError):
            return default

    def _safe_float(self, val, default=0.0):
        """Convert *val* to float; return *default* for missing or invalid data."""
        try:
            return float(val) if val not in self._MISSING_VALUES else default
        except (ValueError, TypeError):
            return default

    def run(self) -> dict:
        """Query every GPU and the host, and return the full health report.

        Returns:
            On success, a dict with ``passed`` (False when any GPU has a FAIL
            check), ``gpu_health`` (one entry per GPU), ``system_health``,
            ``timestamp`` and ``detected_gpu_type``.  When ``nvidia-smi`` is
            missing or the GPU count cannot be obtained, a dict with ``error``
            and ``passed=False`` instead.
        """
        if not shutil.which("nvidia-smi"):
            return {"error": "nvidia-smi not found", "passed": False}

        gpu_count_str = self._run_smi("count")
        if not gpu_count_str:
            return {"error": "nvidia-smi query failed", "passed": False}
        try:
            gpu_count = int(gpu_count_str.splitlines()[0])
        except ValueError:
            # Report unparsable output the same way as any other query failure
            # instead of crashing the whole run.
            return {
                "error": f"nvidia-smi returned an unexpected GPU count: {gpu_count_str!r}",
                "passed": False,
            }

        metrics = {
            name: self._query_lines(field)
            for name, field in self._GPU_QUERY_FIELDS.items()
        }
        gpu_health = [self._assess_gpu(i, metrics) for i in range(gpu_count)]

        # FAIL is the only status that fails the run; deriving the verdict from
        # the per-GPU status keeps the table and the overall result in agreement.
        overall_pass = all(gpu["status"] != "FAIL" for gpu in gpu_health)

        system_health = self._check_system()

        return {
            "passed": overall_pass,
            "gpu_health": gpu_health,
            "system_health": system_health,
            "timestamp": datetime.now().isoformat(),
            "detected_gpu_type": self.gpu_type,
        }

    def _assess_gpu(self, index: int, metrics: dict) -> dict:
        """Evaluate every check for the GPU at *index*.

        Args:
            index: Zero-based GPU index (row in each metric column).
            metrics: Metric name (keys of ``_GPU_QUERY_FIELDS``) mapped to a
                list of raw per-GPU strings.  Missing or short columns fall
                back to each check's default.

        Returns:
            ``{"index": index, "status": worst, "checks": {...}}`` where
            ``worst`` is the most severe status among the checks.
        """

        def cell(name, default):
            column = metrics.get(name) or []
            return column[index] if index < len(column) else default

        cfg = self.health_cfg
        temp_warn = cfg.get("temp_warning", 80)
        temp_crit = cfg.get("temp_critical", 90)
        power_lim = cfg.get("power_limit", self.specs.get("tdp_watts", 700))

        checks = {}

        temp = self._safe_int(cell("temperature", 0))
        if temp >= temp_crit:
            temp_status, temp_threshold = "FAIL", temp_crit
        elif temp >= temp_warn:
            temp_status, temp_threshold = "WARN", temp_warn
        else:
            temp_status, temp_threshold = "PASS", temp_warn
        checks["temperature"] = {"value": temp, "status": temp_status, "threshold": temp_threshold}

        # Use the configured/spec limit both when the column is missing and
        # when the GPU reports the limit as N/A; otherwise the limit would be
        # 0.0 and any non-zero draw would be flagged.
        fallback_limit = self._safe_float(power_lim)
        draw = self._safe_float(cell("power_draw", 0))
        limit = self._safe_float(cell("power_limit", fallback_limit), fallback_limit)
        # 5% headroom over the limit before warning.
        checks["power"] = {
            "value": draw,
            "limit": limit,
            "status": "PASS" if draw <= limit * 1.05 else "WARN",
        }

        single = self._safe_int(cell("ecc_single", 0))
        double = self._safe_int(cell("ecc_double", 0))
        if double > 0:
            ecc_status = "FAIL"
        elif single > 100:
            ecc_status = "WARN"
        else:
            ecc_status = "PASS"
        checks["ecc_errors"] = {"single": single, "double": double, "status": ecc_status}

        # Placeholder: no query backs this check, so it always reports PASS.
        checks["memory_errors"] = {"status": "PASS"}

        checks["pcie_link"] = self._check_pcie(
            self._safe_int(cell("pcie_gen", 0)),
            self._safe_int(cell("pcie_width", 0)),
            self.specs.get("pcie_gen", 0) or 0,
        )

        sm_clock = self._safe_int(cell("clock_sm", 0))
        mem_clock = self._safe_int(cell("clock_mem", 0))
        checks["clock_speed"] = {
            "sm": sm_clock,
            "mem": mem_clock,
            "status": "PASS" if sm_clock > 0 and mem_clock > 0 else "WARN",
        }

        checks["throttling"] = self._check_throttling(cell("throttle", ""))

        persistence_on = cell("persistence", "") == "Enabled"
        checks["persistence_mode"] = {
            "enabled": persistence_on,
            "status": "PASS" if persistence_on else "WARN",
        }

        return {"index": index, "status": self._worst_status(checks), "checks": checks}

    @staticmethod
    def _check_pcie(gen: int, width: int, expected_gen: int) -> dict:
        """Compare the negotiated PCIe link against what the GPU should run at.

        A known GPU (``expected_gen > 0``) must reach its expected generation
        at x16; an unknown GPU only needs at least x8.  A shortfall is a FAIL.
        """
        # NOTE(review): nvidia-smi reports the *current* link generation, which
        # may be lowered while the GPU is idle - confirm this check is run
        # with the GPU under load.
        if expected_gen > 0:
            link_ok = gen >= expected_gen and width >= 16
        else:
            link_ok = width >= 8  # unknown GPU: just check width
        return {"gen": gen, "width": width, "status": "PASS" if link_ok else "FAIL"}

    @classmethod
    def _check_throttling(cls, raw: str) -> dict:
        """Decode a ``clocks_throttle_reasons.active`` hex bitmask.

        Returns ``{"status": ..., "reasons": [...]}``.  The idle bit is
        ignored; any other set bit yields FAIL.  Bits without a known label
        are reported together as ``unknown(0x...)``.  Text that is not a
        ``0x``-prefixed hex number is treated as "no throttling".
        """
        text = (raw or "").strip().lower()
        try:
            bitmask = int(text, 16) if text.startswith("0x") else 0
        except ValueError:
            bitmask = 0

        active = bitmask & ~cls._THROTTLE_IDLE_BIT
        reasons = [label for bit, label in cls._THROTTLE_REASONS if active & bit]

        known_bits = 0
        for bit, _label in cls._THROTTLE_REASONS:
            known_bits |= bit
        unknown_bits = active & ~known_bits
        if unknown_bits:
            reasons.append(f"unknown(0x{unknown_bits:x})")

        return {"status": "FAIL" if active else "PASS", "reasons": reasons}

    @staticmethod
    def _worst_status(checks: dict) -> str:
        """Return the most severe status among *checks* (FAIL > WARN > PASS)."""
        statuses = {check["status"] for check in checks.values()}
        if "FAIL" in statuses:
            return "FAIL"
        if "WARN" in statuses:
            return "WARN"
        return "PASS"

    @staticmethod
    def _read_text(path: str) -> Optional[str]:
        """Return the contents of *path*, or None when it cannot be read."""
        try:
            with open(path, encoding="utf-8", errors="replace") as handle:
                return handle.read()
        except OSError:
            return None

    @staticmethod
    def _list_dir(path: str) -> list:
        """Return the sorted entries of *path*, or [] when it cannot be listed."""
        try:
            return sorted(os.listdir(path))
        except OSError:
            return []

    @staticmethod
    def _parse_hugepages_total(meminfo: str) -> int:
        """Extract the ``HugePages_Total`` count from ``/proc/meminfo`` text."""
        for line in meminfo.splitlines():
            if "hugepages_total" in line.lower():
                fields = line.split()
                try:
                    return int(fields[1])
                except (IndexError, ValueError):
                    return 0
        return 0

    @staticmethod
    def _parse_thp_mode(text: str) -> str:
        """Return the bracketed mode from the THP ``enabled`` file, or "unknown"."""
        for mode in ("always", "madvise", "never"):
            if f"[{mode}]" in text:
                return mode
        return "unknown"

    def _check_system(self) -> dict:
        """Gather host-level settings relevant to GPU workloads.

        Returns:
            A dict with ``nvidia_persistenced``, ``hugepages``, ``swap``,
            ``transparent_hugepage``, ``file_descriptors``,
            ``infiniband_devices``, ``rdma_devices`` and ``nccl_env_vars``.
            Anything that cannot be read keeps its "not present" default.
        """
        daemon = "nvidia-persistenced"
        installed = shutil.which(daemon) is not None
        running = False
        if installed:
            # pgrep matches against the kernel process name, which is capped at
            # 15 characters, so the full 19-character name can never match.
            running = self._run_cmd(["pgrep", "-x", daemon[:15]]) is not None

        hugepages = self._parse_hugepages_total(self._read_text("/proc/meminfo") or "")

        # Active swap areas are the /proc/swaps rows that start with a path.
        swaps = self._read_text("/proc/swaps") or ""
        swap_enabled = any(line.startswith("/") for line in swaps.splitlines())

        thp = self._parse_thp_mode(
            self._read_text("/sys/kernel/mm/transparent_hugepage/enabled") or ""
        )

        fd_soft, fd_max = 1024, 65535
        try:
            import resource

            fd_soft, fd_max = resource.getrlimit(resource.RLIMIT_NOFILE)
        except (ImportError, ValueError, OSError):
            # Best effort: keep the defaults when limits are unavailable
            # (for example, no `resource` module on this platform).
            pass

        ib_devs = self._list_dir("/sys/class/infiniband")
        rdma_devs = self._list_dir("/sys/class/infiniband_verbs")

        nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}

        return {
            "nvidia_persistenced": {"installed": installed, "running": running},
            "hugepages": {"configured": hugepages > 0, "count": hugepages},
            "swap": {"enabled": swap_enabled},
            "transparent_hugepage": thp,
            "file_descriptors": {"soft": fd_soft, "max": fd_max},
            "infiniband_devices": ib_devs,
            "rdma_devices": rdma_devs,
            "nccl_env_vars": nccl_env,
        }

    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Render a report produced by :meth:`run` to a Rich console.

        Args:
            results: The dict returned by ``run()`` (either a full report or
                an ``error`` dict).
            console: Console to print to; a new one is created when omitted.
        """
        out = console if console is not None else Console()
        if "error" in results:
            out.print(f"[bold red]Error: {escape(str(results['error']))}[/bold red]")
            return

        if results.get("passed", False):
            verdict, border = "[bold green]✓ ALL CHECKS PASSED[/bold green]", "green"
        else:
            verdict, border = "[bold red]✗ SOME CHECKS FAILED[/bold red]", "red"
        out.print(Panel(verdict, border_style=border))

        icons = {
            "PASS": "[green]✓[/green]",
            "WARN": "[yellow]![/yellow]",
            "FAIL": "[red]✗[/red]",
        }
        colors = {"PASS": "green", "WARN": "yellow"}

        def icon(status):
            # Unknown statuses are shown verbatim.
            return icons.get(status, status)

        gpu_health = results.get("gpu_health", [])
        if gpu_health:
            table = Table(title="GPU Health Checks", box=None, padding=(0, 1))
            table.add_column("GPU", style="bold", width=5)
            table.add_column("Temp", width=10)
            table.add_column("Power", width=12)
            table.add_column("ECC", width=10)
            table.add_column("PCIe", width=10)
            table.add_column("Clock", width=8)
            table.add_column("Throttle", width=10)
            table.add_column("Persist", width=8)
            table.add_column("Status", width=7)

            for gpu in gpu_health:
                ch = gpu["checks"]
                color = colors.get(gpu["status"], "red")
                status_text = f"[{color}]{gpu['status']}[/{color}]"

                temp = f"{ch['temperature']['value']}°C {icon(ch['temperature']['status'])}"
                power = f"{ch['power']['value']:.0f}W {icon(ch['power']['status'])}"
                ecc = f"S:{ch['ecc_errors']['single']} D:{ch['ecc_errors']['double']} {icon(ch['ecc_errors']['status'])}"
                pcie = f"Gen{ch['pcie_link']['gen']}x{ch['pcie_link']['width']} {icon(ch['pcie_link']['status'])}"
                clock = f"{ch['clock_speed']['sm']}MHz {icon(ch['clock_speed']['status'])}"
                throttle = icon(ch["throttling"]["status"])
                persist = icon(ch["persistence_mode"]["status"])

                table.add_row(
                    str(gpu["index"]), temp, power, ecc, pcie, clock, throttle, persist, status_text
                )
            out.print(table)

        sys_h = results.get("system_health", {})
        if not sys_h:
            return

        out.print("\n[bold cyan]System Health[/bold cyan]")

        daemon = sys_h.get("nvidia_persistenced", {})
        if not daemon.get("installed"):
            daemon_status = "[yellow]Not installed[/yellow]"
        elif daemon.get("running"):
            daemon_status = "[green]Running[/green]"
        else:
            daemon_status = "[red]Not running[/red]"
        out.print(f" nvidia-persistenced : {daemon_status}")

        hp = sys_h.get("hugepages", {})
        hp_status = "[green]Configured[/green]" if hp.get("configured") else "[yellow]Not configured[/yellow]"
        out.print(f" Hugepages : {hp_status} ({hp.get('count', 0)} pages)")

        swap = sys_h.get("swap", {})
        swap_txt = "[red]Enabled[/red]" if swap.get("enabled") else "[green]Disabled[/green]"
        out.print(f" Swap : {swap_txt}")

        thp = sys_h.get("transparent_hugepage", "unknown")
        thp_color = "green" if thp in ("always", "madvise") else "yellow"
        out.print(f" Transparent HP : [{thp_color}]{thp}[/{thp_color}]")

        fd = sys_h.get("file_descriptors", {})
        fd_color = "green" if fd.get("soft", 0) >= 65536 else "yellow"
        out.print(f" File Descriptors : [{fd_color}]{fd.get('soft', 'N/A')} (soft) / {fd.get('max', 'N/A')} (max)[/{fd_color}]")

        # Device names and environment values come from the host, so escape
        # them to stop square brackets being interpreted as Rich markup.
        ib = sys_h.get("infiniband_devices", [])
        rdma = sys_h.get("rdma_devices", [])
        if ib:
            out.print(f" InfiniBand : [green]{escape(', '.join(ib))}[/green]")
        else:
            out.print(" InfiniBand : [yellow]No devices detected[/yellow]")
        if rdma:
            out.print(f" RDMA : [green]{escape(', '.join(rdma))}[/green]")

        nccl = sys_h.get("nccl_env_vars", {})
        if nccl:
            out.print(" NCCL Env Vars:")
            for key, value in sorted(nccl.items()):
                out.print(f" {escape(str(key))}={escape(str(value))}")
        else:
            out.print(" NCCL Env Vars : [yellow]None set[/yellow]")
|