From fc97a768cf0a75d521b65690394e5b7251e3c616 Mon Sep 17 00:00:00 2001 From: zulifeng Date: Wed, 13 May 2026 14:52:41 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=8C=89=20H100=20=E7=94=9F=E4=BA=A7?= =?UTF-8?q?=E9=AA=8C=E6=94=B6=E6=A0=87=E5=87=86=E6=9B=B4=E6=96=B0=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E6=8C=87=E6=A0=87=E4=B8=8E=E5=88=A4=E5=AE=9A=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段 (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400), 与 marketing peak 解耦,作为绝对 TFLOPS PASS 门槛 - benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用 - report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% / FAIL <门槛×90%);表头切换为 Threshold 列;Memory D2D verdict 由 50/30 收紧至 80/60;无阈值配置的 GPU 保留旧 % 效率逻辑 - nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter 0.45、Broadcast/SendRecv 0.40、AllToAll 0.35,与验收文档 §5 一致 - configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、 iterations 100→500、use_compile 改 true;health temp_warning 80→75、temp_critical 90→85,匹配生产验收稳态温度要求 Co-Authored-By: Claude Opus 4.7 --- configs/default.yaml | 12 +++---- modules/benchmark.py | 6 ++++ modules/gpu_specs.py | 10 ++++++ modules/nccl_test.py | 22 ++++++------ modules/report.py | 80 +++++++++++++++++++++++++++++++++++++------- 5 files changed, 99 insertions(+), 31 deletions(-) diff --git a/configs/default.yaml b/configs/default.yaml index 4a13313..4de648b 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -14,14 +14,14 @@ benchmark: - fp16 - bf16 - fp8 - matrix_size: 4096 - warmup: 10 - iterations: 100 - use_compile: false + matrix_size: 8192 + warmup: 50 + iterations: 500 + use_compile: true health: - temp_warning: 80 - temp_critical: 90 + temp_warning: 75 + temp_critical: 85 power_limit: null # null = auto-detect from GPU TDP per gpu_specs.py nccl: diff --git a/modules/benchmark.py b/modules/benchmark.py index e58a8f0..856d0cd 100644 --- a/modules/benchmark.py +++ b/modules/benchmark.py @@ -469,6 +469,12 @@ class Benchmark: "per_dtype_tflops": results_by_dtype, "peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map}, "efficiency_pct": efficiency, + # Absolute TFLOPS PASS thresholds (decoupled from peak). When present, + # report.py judges PASS/WARN/FAIL against these directly instead of + # using % of peak. Empty dict => fall back to legacy 80% rule. + "pass_thresholds_tflops": dict( + self.specs.get("compute_pass_thresholds_tflops") or {} + ), "per_gpu": per_gpu_results, "matrix_size": matrix_size, "warmup": warmup, diff --git a/modules/gpu_specs.py b/modules/gpu_specs.py index db3ca30..afba93d 100644 --- a/modules/gpu_specs.py +++ b/modules/gpu_specs.py @@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = { # Specs database — ALL values are DENSE (non-sparse) TFLOPS GPU_SPECS = { "h100": { + # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max). + # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used + # by report.py — decoupled from peaks so marketing-spec changes (dense vs + # sparse vs FP8-sparsity) don't shift the validation bar. "full_name": "NVIDIA H100 SXM5", "architecture": "Hopper", "compute_capability": 9.0, @@ -31,6 +35,11 @@ GPU_SPECS = { "fp16_tflops": 990, # dense (1979 sparse w/ 2:4) "bf16_tflops": 990, # dense "fp8_tflops": 1979, # dense + "compute_pass_thresholds_tflops": { + "fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400, + # FP64 63 / INT8 1536 — listed for documentation; benchmark module + # doesn't currently exercise these dtypes. + }, "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, # bidirectional @@ -171,6 +180,7 @@ _UNKNOWN_SPECS = { "fp16_tflops": 0, "bf16_tflops": 0, "fp8_tflops": 0, + "compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak "tdp_watts": 700, "nvlink_gen": 0, "nvlink_bandwidth_gbps": 0, diff --git a/modules/nccl_test.py b/modules/nccl_test.py index 7435577..fd9ab6a 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -24,18 +24,16 @@ except ImportError: # Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW. -# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh -# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce. -# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range -# vs AllReduce in 400-500 GB/s. Using a single 40% threshold for both produced -# false positives for AllToAll. +# Values aligned with the H100 production acceptance criteria (acceptance doc §5). +# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is +# set lower; broadcast/sendrecv sit between. _OP_BW_FRACTIONS = { - "allreduce": 0.40, - "alltoall": 0.30, - "broadcast": 0.35, - "reducescatter": 0.38, - "allgather": 0.38, - "sendrecv": 0.35, + "allreduce": 0.45, + "allgather": 0.45, + "reducescatter": 0.45, + "broadcast": 0.40, + "sendrecv": 0.40, + "alltoall": 0.35, } @@ -105,7 +103,7 @@ class NCCLTest: return float(user_override) if nvlink_bw <= 0: return 10.0 # conservative floor - frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40) + frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45) return round(nvlink_bw * frac) if self.gpu_type == "unknown": diff --git a/modules/report.py b/modules/report.py index c8248cd..d9e1eba 100644 --- a/modules/report.py +++ b/modules/report.py @@ -283,7 +283,8 @@ class ReportGenerator: "nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n" ) else: - verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL") + # Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%. + verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL") lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n") # --- Compute Throughput --- @@ -293,9 +294,18 @@ class ReportGenerator: per_dtype = comp_data.get("per_dtype_tflops", {}) peak_tflops = comp_data.get("peak_tflops", {}) eff_pct = comp_data.get("efficiency_pct", {}) - lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |") + # Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops. + # When present, override the legacy 80%-of-peak rule on a per-dtype basis. + pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {} + use_abs = bool(pass_thresholds) + if use_abs: + lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |") + else: + lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |") lines.append("|-------|-------------------|------|------------|--------|") worst_eff = 100.0 + overall_status = "PASS" + rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0} for dt, val in per_dtype.items(): if isinstance(val, str): # skipped or error @@ -305,11 +315,26 @@ class ReportGenerator: ef = eff_pct.get(dt, 0) if isinstance(ef, (int, float)) and ef > 0: worst_eff = min(worst_eff, ef) - status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL") - lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |") + thr = pass_thresholds.get(dt) + if use_abs and thr: + if val >= thr: + status = "PASS" + elif val >= thr * 0.9: + status = "WARN" + else: + status = "FAIL" + lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |") + else: + status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL") + lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |") + if rank.get(status, 0) > rank.get(overall_status, 0): + overall_status = status lines.append("") - overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL") - lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n") + if use_abs: + lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n") + else: + overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL") + lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n") # --- NCCL --- nccl = results.get("nccl") @@ -449,7 +474,7 @@ class ReportGenerator: items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)")) else: eff = mem.get("efficiency_pct") or 0 - verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL") + verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL") items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)")) # Compute @@ -458,14 +483,43 @@ class ReportGenerator: if comp.get("error"): items.append(("Compute Throughput", f"ERROR: {comp['error']}")) else: + per_dtype = comp.get("per_dtype_tflops", {}) eff_pct = comp.get("efficiency_pct", {}) - valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0] - if valid_effs: - worst = min(valid_effs) - verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL") - items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)")) + pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {} + if pass_thresholds: + # Absolute TFLOPS judgment, mirroring the per-dtype table above. + rank = {"PASS": 0, "WARN": 1, "FAIL": 2} + worst_status = "PASS" + worst_dt = None + for dt, thr in pass_thresholds.items(): + val = per_dtype.get(dt) + if not isinstance(val, (int, float)): + continue + if val >= thr: + st = "PASS" + elif val >= thr * 0.9: + st = "WARN" + else: + st = "FAIL" + if rank[st] > rank[worst_status]: + worst_status = st + worst_dt = dt + if worst_dt: + items.append(( + "Compute Throughput", + f"{worst_status} (worst {worst_dt.upper()} " + f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})" + )) + else: + items.append(("Compute Throughput", f"{worst_status}")) else: - items.append(("Compute Throughput", "N/A")) + valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0] + if valid_effs: + worst = min(valid_effs) + verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL") + items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)")) + else: + items.append(("Compute Throughput", "N/A")) # NCCL if "nccl" in results: