From fc97a768cf0a75d521b65690394e5b7251e3c616 Mon Sep 17 00:00:00 2001
From: zulifeng <lifeng.zu@d-robotics.cc>
Date: Wed, 13 May 2026 14:52:41 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=8C=89=20H100=20=E7=94=9F=E4=BA=A7?=
 =?UTF-8?q?=E9=AA=8C=E6=94=B6=E6=A0=87=E5=87=86=E6=9B=B4=E6=96=B0=E6=B5=8B?=
 =?UTF-8?q?=E8=AF=95=E6=8C=87=E6=A0=87=E4=B8=8E=E5=88=A4=E5=AE=9A=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段
  (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400)，
  与 marketing peak 解耦，作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% /
  FAIL <门槛×90%)；表头切换为 Threshold 列；Memory D2D verdict
  由 50/30 收紧至 80/60（汇总表中对应判定由 80/50 收紧至 80/60）；
  无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter
  0.45、Broadcast/SendRecv 0.40、AllToAll 0.35，与验收文档 §5 一致；
  未列出的 op 默认比例同步由 0.40 调整为 0.45
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、
  iterations 100→500、use_compile 改 true；health temp_warning
  80→75、temp_critical 90→85，匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 configs/default.yaml | 12 +++----
 modules/benchmark.py |  6 ++++
 modules/gpu_specs.py | 10 ++++++
 modules/nccl_test.py | 22 ++++++------
 modules/report.py    | 80 +++++++++++++++++++++++++++++++++++++-------
 5 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index 4a13313..4de648b 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -14,14 +14,14 @@ benchmark:
       - fp16
       - bf16
       - fp8
-    matrix_size: 4096
-    warmup: 10
-    iterations: 100
-    use_compile: false
+    matrix_size: 8192
+    warmup: 50
+    iterations: 500
+    use_compile: true
 
 health:
-  temp_warning: 80
-  temp_critical: 90
+  temp_warning: 75
+  temp_critical: 85
   power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py
 
 nccl:
diff --git a/modules/benchmark.py b/modules/benchmark.py
index e58a8f0..856d0cd 100644
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@@ -469,6 +469,12 @@ class Benchmark:
                 "per_dtype_tflops": results_by_dtype,
                 "peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
                 "efficiency_pct": efficiency,
+                # Absolute TFLOPS PASS thresholds (decoupled from peak). When non-empty,
+                # report.py judges PASS/WARN/FAIL against these directly instead of
+                # % of peak. Empty dict => legacy rule (PASS >= 80%, WARN >= 50% of peak).
+                "pass_thresholds_tflops": dict(
+                    self.specs.get("compute_pass_thresholds_tflops") or {}
+                ),
                 "per_gpu": per_gpu_results,
                 "matrix_size": matrix_size,
                 "warmup": warmup,
diff --git a/modules/gpu_specs.py b/modules/gpu_specs.py
index db3ca30..afba93d 100644
--- a/modules/gpu_specs.py
+++ b/modules/gpu_specs.py
@@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = {
 # Specs database — ALL values are DENSE (non-sparse) TFLOPS
 GPU_SPECS = {
     "h100": {
+        # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
+        # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
+        # by report.py — decoupled from peaks so marketing-spec changes (dense vs
+        # sparse vs FP8-sparsity) don't shift the validation bar.
         "full_name": "NVIDIA H100 SXM5",
         "architecture": "Hopper",
         "compute_capability": 9.0,
@@ -31,6 +35,11 @@ GPU_SPECS = {
         "fp16_tflops": 990,                 # dense (1979 sparse w/ 2:4)
         "bf16_tflops": 990,                 # dense
         "fp8_tflops": 1979,                 # dense
+        "compute_pass_thresholds_tflops": {
+            "fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
+            # FP64 63 / INT8 1536 — listed for documentation; benchmark module
+            # doesn't currently exercise these dtypes.
+        },
         "tdp_watts": 700,
         "nvlink_gen": 4,
         "nvlink_bandwidth_gbps": 900,       # bidirectional
@@ -171,6 +180,7 @@ _UNKNOWN_SPECS = {
     "fp16_tflops": 0,
     "bf16_tflops": 0,
     "fp8_tflops": 0,
+    "compute_pass_thresholds_tflops": {},  # empty => report.py falls back to 80% of peak
     "tdp_watts": 700,
     "nvlink_gen": 0,
     "nvlink_bandwidth_gbps": 0,
diff --git a/modules/nccl_test.py b/modules/nccl_test.py
index 7435577..fd9ab6a 100644
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@@ -24,18 +24,16 @@ except ImportError:
 
 
 # Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
-# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh
-# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce.
-# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range
-# vs AllReduce in 400-500 GB/s. Using a single 40% threshold for both produced
-# false positives for AllToAll.
+# Values aligned with the H100 production acceptance criteria (acceptance doc §5).
+# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is
+# set lower; broadcast/sendrecv sit between.
 _OP_BW_FRACTIONS = {
-    "allreduce":     0.40,
-    "alltoall":      0.30,
-    "broadcast":     0.35,
-    "reducescatter": 0.38,
-    "allgather":     0.38,
-    "sendrecv":      0.35,
+    "allreduce":     0.45,
+    "allgather":     0.45,
+    "reducescatter": 0.45,
+    "broadcast":     0.40,
+    "sendrecv":      0.40,
+    "alltoall":      0.35,
 }
 
 
@@ -105,7 +103,7 @@ class NCCLTest:
                 return float(user_override)
             if nvlink_bw <= 0:
                 return 10.0  # conservative floor
-            frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40)
+            frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45)
             return round(nvlink_bw * frac)
 
         if self.gpu_type == "unknown":
diff --git a/modules/report.py b/modules/report.py
index c8248cd..d9e1eba 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -283,7 +283,8 @@ class ReportGenerator:
                     "nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
                 )
             else:
-                verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
+                # Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%.
+                verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
                 lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
 
         # --- Compute Throughput ---
@@ -293,9 +294,18 @@ class ReportGenerator:
             per_dtype = comp_data.get("per_dtype_tflops", {})
             peak_tflops = comp_data.get("peak_tflops", {})
             eff_pct = comp_data.get("efficiency_pct", {})
-            lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
+            # Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
+            # A dtype with a threshold is judged against it; one without keeps the legacy % rule.
+            pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
+            use_abs = bool(pass_thresholds)
+            if use_abs:
+                lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
+            else:
+                lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
             lines.append("|-------|-------------------|------|------------|--------|")
             worst_eff = 100.0
+            overall_status = "PASS"
+            rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
             for dt, val in per_dtype.items():
                 if isinstance(val, str):
                     # skipped or error
@@ -305,11 +315,26 @@ class ReportGenerator:
                     ef = eff_pct.get(dt, 0)
                     if isinstance(ef, (int, float)) and ef > 0:
                         worst_eff = min(worst_eff, ef)
-                    status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
-                    lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
+                    thr = pass_thresholds.get(dt)
+                    if use_abs and thr:
+                        if val >= thr:
+                            status = "PASS"
+                        elif val >= thr * 0.9:
+                            status = "WARN"
+                        else:
+                            status = "FAIL"
+                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
+                    else:
+                        status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
+                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
+                    if rank.get(status, 0) > rank.get(overall_status, 0):
+                        overall_status = status
             lines.append("")
-            overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
-            lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n")
+            if use_abs:
+                lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
+            else:
+                overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
+                lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
 
         # --- NCCL ---
         nccl = results.get("nccl")
@@ -449,7 +474,7 @@ class ReportGenerator:
                 items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
             else:
                 eff = mem.get("efficiency_pct") or 0
-                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
+                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                 items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
 
         # Compute
@@ -458,14 +483,43 @@ class ReportGenerator:
             if comp.get("error"):
                 items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
             else:
+                per_dtype = comp.get("per_dtype_tflops", {})
                 eff_pct = comp.get("efficiency_pct", {})
-                valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
-                if valid_effs:
-                    worst = min(valid_effs)
-                    verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
-                    items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
+                pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
+                if pass_thresholds:
+                    # Absolute TFLOPS judgment (same cut-offs as the per-dtype table); dtypes without a threshold are skipped.
+                    rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
+                    worst_status = "PASS"
+                    worst_dt = None
+                    for dt, thr in pass_thresholds.items():
+                        val = per_dtype.get(dt)
+                        if not isinstance(val, (int, float)):
+                            continue
+                        if val >= thr:
+                            st = "PASS"
+                        elif val >= thr * 0.9:
+                            st = "WARN"
+                        else:
+                            st = "FAIL"
+                        if rank[st] > rank[worst_status]:
+                            worst_status = st
+                            worst_dt = dt
+                    if worst_dt:
+                        items.append((
+                            "Compute Throughput",
+                            f"{worst_status} (worst {worst_dt.upper()} "
+                            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
+                        ))
+                    else:
+                        items.append(("Compute Throughput", f"{worst_status}"))
                 else:
-                    items.append(("Compute Throughput", "N/A"))
+                    valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
+                    if valid_effs:
+                        worst = min(valid_effs)
+                        verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
+                        items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
+                    else:
+                        items.append(("Compute Throughput", "N/A"))
 
         # NCCL
         if "nccl" in results: