From 1db1313d50f69337b18ab11f01dd5ab6f93c8c2d Mon Sep 17 00:00:00 2001 From: "hongshuai.dong" Date: Sun, 10 May 2026 17:22:43 +0800 Subject: [PATCH] fix(health): mark PCIe link downgrade as FAIL instead of WARN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCIe link health check was producing inconsistent verdicts: when the negotiated link did not meet the GPU's expected Gen/Width (e.g. an H200 running at Gen4 instead of Gen5, a recognized GPU dropping below x16, or an unrecognized GPU dropping below x8), the code correctly flipped overall_pass to False — but recorded the per-GPU status as "WARN" rather than "FAIL". This mismatch broke the convention used by every other check in the module (temperature, ECC, throttling), where FAIL is the only status that drives overall_pass=False, and WARN is purely informational. As a result, the rendered Markdown / table output would show a yellow WARN badge for the affected GPU while the overall Health Check verdict came back red FAIL, leaving operators to wonder which signal to trust. A PCIe link downgrade is not a soft warning — a one-generation drop halves H2D/D2H bandwidth (e.g. Gen5 x16 ~64 GB/s -> Gen4 x16 ~32 GB/s), directly impacting data loading, checkpoint I/O, and ZeRO/offload throughput. For an acceptance-test tool this should be a hard failure, consistent with how overall_pass already treats it. Change: in modules/health_check.py, set status to "FAIL" (not "WARN") when pcie_ok is False. This applies to both the known-GPU path (Gen >= expected and Width >= 16) and the unknown-GPU fallback path (Width >= 8). No behavioral change to overall_pass — only the per-GPU status string is corrected so the table view, Markdown report, and the overall verdict now agree. 
--- modules/health_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/health_check.py b/modules/health_check.py index dd64071..8dbfa7b 100644 --- a/modules/health_check.py +++ b/modules/health_check.py @@ -122,7 +122,7 @@ class HealthCheck: pcie_ok = pw >= 8 # unknown GPU: just check width if not pcie_ok: overall_pass = False - checks["pcie_link"] = {"gen": pg, "width": pw, "status": "PASS" if pcie_ok else "WARN"} + checks["pcie_link"] = {"gen": pg, "width": pw, "status": "PASS" if pcie_ok else "FAIL"} sm = self._safe_int(clock_sms[i] if i < len(clock_sms) else 0) mm = self._safe_int(clock_mems[i] if i < len(clock_mems) else 0) -- 2.47.2