feat: 按 H100 生产验收标准更新测试指标与判定逻辑

- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段 (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400)，与 marketing peak 解耦，作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% / FAIL <门槛×90%)；表头切换为 Threshold 列；Memory D2D verdict 由 50/30 收紧至 80/60；无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter 0.45、Broadcast/SendRecv 0.40、AllToAll 0.35，与验收文档 §5 一致
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、iterations 100→500、use_compile 改 true；health temp_warning 80→75、temp_critical 90→85，匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-13 14:52:41 +08:00 · 2026-05-13 14:52:41 +08:00 · fc97a768cf
commit fc97a768cf
parent 375d439abb
5 changed files with 99 additions and 31 deletions
--- a/configs/default.yaml
+++ b/configs/default.yaml
@ -14,14 +14,14 @@ benchmark:
      - fp16
      - bf16
      - fp8
-    matrix_size: 4096
+    matrix_size: 8192
-    warmup: 10
+    warmup: 50
-    iterations: 100
+    iterations: 500
-    use_compile: false
+    use_compile: true
 health:
-  temp_warning: 80
+  temp_warning: 75
-  temp_critical: 90
+  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py
 nccl:
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@ -469,6 +469,12 @@ class Benchmark:
                "per_dtype_tflops": results_by_dtype,
                "peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
                "efficiency_pct": efficiency,
                # Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
                # report.py judges PASS/WARN/FAIL against these directly instead of
                # using % of peak. Empty dict => fall back to legacy 80% rule.
                "pass_thresholds_tflops": dict(
                    self.specs.get("compute_pass_thresholds_tflops") or {}
                ),
                "per_gpu": per_gpu_results,
                "matrix_size": matrix_size,
                "warmup": warmup,
--- a/modules/gpu_specs.py
+++ b/modules/gpu_specs.py
@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = {
 # Specs database — ALL values are DENSE (non-sparse) TFLOPS
 GPU_SPECS = {
    "h100": {
        # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
        # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
        # by report.py — decoupled from peaks so marketing-spec changes (dense vs
        # sparse vs FP8-sparsity) don't shift the validation bar.
        "full_name": "NVIDIA H100 SXM5",
        "architecture": "Hopper",
        "compute_capability": 9.0,
@ -31,6 +35,11 @@ GPU_SPECS = {
        "fp16_tflops": 990,                 # dense (1979 sparse w/ 2:4)
        "bf16_tflops": 990,                 # dense
        "fp8_tflops": 1979,                 # dense
        "compute_pass_thresholds_tflops": {
            "fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
            # FP64 (63) and INT8 (1536) thresholds are noted here for documentation
            # only; they are omitted from this dict because the benchmark module
            # doesn't currently exercise these dtypes.
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,       # bidirectional
@ -171,6 +180,7 @@ _UNKNOWN_SPECS = {
    "fp16_tflops": 0,
    "bf16_tflops": 0,
    "fp8_tflops": 0,
    "compute_pass_thresholds_tflops": {},  # empty => report.py falls back to 80% of peak
    "tdp_watts": 700,
    "nvlink_gen": 0,
    "nvlink_bandwidth_gbps": 0,
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@ -24,18 +24,16 @@ except ImportError:
 # Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
-# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh
+# Values aligned with the H100 production acceptance criteria (acceptance doc §5).
-# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce.
+# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is
-# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range
+# set lower; broadcast/sendrecv sit between.
 # Public benchmarks put AllReduce bus BW at 400-500 GB/s; using a single 40%
 # threshold for both ops produced false positives for AllToAll.
 _OP_BW_FRACTIONS = {
-    "allreduce":     0.40,
+    "allreduce":     0.45,
-    "alltoall":      0.30,
+    "allgather":     0.45,
-    "broadcast":     0.35,
+    "reducescatter": 0.45,
-    "reducescatter": 0.38,
+    "broadcast":     0.40,
-    "allgather":     0.38,
+    "sendrecv":      0.40,
-    "sendrecv":      0.35,
+    "alltoall":      0.35,
 }
@ -105,7 +103,7 @@ class NCCLTest:
                return float(user_override)
            if nvlink_bw <= 0:
                return 10.0  # conservative floor
-            frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40)
+            frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45)
            return round(nvlink_bw * frac)
        if self.gpu_type == "unknown":
--- a/modules/report.py
+++ b/modules/report.py
@ -283,7 +283,8 @@ class ReportGenerator:
                    "nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
                )
            else:
-                verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
+                # Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%.
                verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
                lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
        # --- Compute Throughput ---
@ -293,9 +294,18 @@ class ReportGenerator:
            per_dtype = comp_data.get("per_dtype_tflops", {})
            peak_tflops = comp_data.get("peak_tflops", {})
            eff_pct = comp_data.get("efficiency_pct", {})
-            lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
+            # Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
            # When present, override the legacy 80%-of-peak rule on a per-dtype basis.
            pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
            use_abs = bool(pass_thresholds)
            if use_abs:
                lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
            else:
                lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
            lines.append("|-------|-------------------|------|------------|--------|")
            worst_eff = 100.0
            overall_status = "PASS"
            rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
            for dt, val in per_dtype.items():
                if isinstance(val, str):
                    # skipped or error
@ -305,11 +315,26 @@ class ReportGenerator:
                    ef = eff_pct.get(dt, 0)
                    if isinstance(ef, (int, float)) and ef > 0:
                        worst_eff = min(worst_eff, ef)
-                    status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
+                    thr = pass_thresholds.get(dt)
-                    lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
+                    if use_abs and thr:
                        if val >= thr:
                            status = "PASS"
                        elif val >= thr * 0.9:
                            status = "WARN"
                        else:
                            status = "FAIL"
                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
                    else:
                        status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
                    if rank.get(status, 0) > rank.get(overall_status, 0):
                        overall_status = status
            lines.append("")
-            overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
+            if use_abs:
-            lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n")
+                lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
            else:
                overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
                lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
        # --- NCCL ---
        nccl = results.get("nccl")
@ -449,7 +474,7 @@ class ReportGenerator:
                items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
            else:
                eff = mem.get("efficiency_pct") or 0
-                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
+                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
        # Compute
@ -458,14 +483,43 @@ class ReportGenerator:
            if comp.get("error"):
                items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
            else:
                per_dtype = comp.get("per_dtype_tflops", {})
                eff_pct = comp.get("efficiency_pct", {})
-                valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
+                pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
-                if valid_effs:
+                if pass_thresholds:
-                    worst = min(valid_effs)
+                    # Absolute TFLOPS judgment, mirroring the per-dtype table above.
-                    verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
+                    rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
-                    items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
+                    worst_status = "PASS"
                    worst_dt = None
                    for dt, thr in pass_thresholds.items():
                        val = per_dtype.get(dt)
                        if not isinstance(val, (int, float)):
                            continue
                        if val >= thr:
                            st = "PASS"
                        elif val >= thr * 0.9:
                            st = "WARN"
                        else:
                            st = "FAIL"
                        if rank[st] > rank[worst_status]:
                            worst_status = st
                            worst_dt = dt
                    if worst_dt:
                        items.append((
                            "Compute Throughput",
                            f"{worst_status} (worst {worst_dt.upper()} "
                            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
                        ))
                    else:
                        items.append(("Compute Throughput", f"{worst_status}"))
                else:
-                    items.append(("Compute Throughput", "N/A"))
+                    valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
                    if valid_effs:
                        worst = min(valid_effs)
                        verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
                        items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
                    else:
                        items.append(("Compute Throughput", "N/A"))
        # NCCL
        if "nccl" in results: