feat: 按 H100 生产验收标准更新测试指标与判定逻辑

- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段
  (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400),
  与 marketing peak 解耦,作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% /
  FAIL <门槛×90%);表头切换为 Threshold 列;Memory D2D verdict
  由 50/30 收紧至 80/60;无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter
  0.45、Broadcast/SendRecv 0.40、AllToAll 0.35,与验收文档 §5 一致
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、
  iterations 100→500、use_compile 改 true;health temp_warning
  80→75、temp_critical 90→85,匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
zulifeng 2026-05-13 14:52:41 +08:00
parent 375d439abb
commit fc97a768cf
5 changed files with 99 additions and 31 deletions

View File

@ -14,14 +14,14 @@ benchmark:
- fp16
- bf16
- fp8
matrix_size: 4096
warmup: 10
iterations: 100
use_compile: false
matrix_size: 8192
warmup: 50
iterations: 500
use_compile: true
health:
temp_warning: 80
temp_critical: 90
temp_warning: 75
temp_critical: 85
power_limit: null # null = auto-detect from GPU TDP per gpu_specs.py
nccl:

View File

@ -469,6 +469,12 @@ class Benchmark:
"per_dtype_tflops": results_by_dtype,
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
"efficiency_pct": efficiency,
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
# report.py judges PASS/WARN/FAIL against these directly instead of
# using % of peak. Empty dict => fall back to legacy 80% rule.
"pass_thresholds_tflops": dict(
self.specs.get("compute_pass_thresholds_tflops") or {}
),
"per_gpu": per_gpu_results,
"matrix_size": matrix_size,
"warmup": warmup,

View File

@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = {
# Specs database — ALL values are DENSE (non-sparse) TFLOPS
GPU_SPECS = {
"h100": {
# Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
# `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
# by report.py — decoupled from peaks so marketing-spec changes (dense vs
# sparse vs FP8-sparsity) don't shift the validation bar.
"full_name": "NVIDIA H100 SXM5",
"architecture": "Hopper",
"compute_capability": 9.0,
@ -31,6 +35,11 @@ GPU_SPECS = {
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
"bf16_tflops": 990, # dense
"fp8_tflops": 1979, # dense
"compute_pass_thresholds_tflops": {
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
# doesn't currently exercise these dtypes.
},
"tdp_watts": 700,
"nvlink_gen": 4,
"nvlink_bandwidth_gbps": 900, # bidirectional
@ -171,6 +180,7 @@ _UNKNOWN_SPECS = {
"fp16_tflops": 0,
"bf16_tflops": 0,
"fp8_tflops": 0,
"compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak
"tdp_watts": 700,
"nvlink_gen": 0,
"nvlink_bandwidth_gbps": 0,

View File

@ -24,18 +24,16 @@ except ImportError:
# Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh
# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce.
# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range
# vs AllReduce in 400-500 GB/s. Using a single 40% threshold for both produced
# false positives for AllToAll.
# Values aligned with the H100 production acceptance criteria (acceptance doc §5).
# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is
# set lower; broadcast/sendrecv sit between.
_OP_BW_FRACTIONS = {
"allreduce": 0.40,
"alltoall": 0.30,
"broadcast": 0.35,
"reducescatter": 0.38,
"allgather": 0.38,
"sendrecv": 0.35,
"allreduce": 0.45,
"allgather": 0.45,
"reducescatter": 0.45,
"broadcast": 0.40,
"sendrecv": 0.40,
"alltoall": 0.35,
}
@ -105,7 +103,7 @@ class NCCLTest:
return float(user_override)
if nvlink_bw <= 0:
return 10.0 # conservative floor
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40)
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45)
return round(nvlink_bw * frac)
if self.gpu_type == "unknown":

View File

@ -283,7 +283,8 @@ class ReportGenerator:
"nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
)
else:
verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
# Tightened to match production acceptance: PASS >= 80%, WARN 60-80%, FAIL < 60%.
verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
# --- Compute Throughput ---
@ -293,9 +294,18 @@ class ReportGenerator:
per_dtype = comp_data.get("per_dtype_tflops", {})
peak_tflops = comp_data.get("peak_tflops", {})
eff_pct = comp_data.get("efficiency_pct", {})
lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
# Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
# When present, override the legacy 80%-of-peak rule on a per-dtype basis.
pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
use_abs = bool(pass_thresholds)
if use_abs:
lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
else:
lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
lines.append("|-------|-------------------|------|------------|--------|")
worst_eff = 100.0
overall_status = "PASS"
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
for dt, val in per_dtype.items():
if isinstance(val, str):
# skipped or error
@ -305,11 +315,26 @@ class ReportGenerator:
ef = eff_pct.get(dt, 0)
if isinstance(ef, (int, float)) and ef > 0:
worst_eff = min(worst_eff, ef)
status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
thr = pass_thresholds.get(dt)
if use_abs and thr:
if val >= thr:
status = "PASS"
elif val >= thr * 0.9:
status = "WARN"
else:
status = "FAIL"
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
else:
status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
if rank.get(status, 0) > rank.get(overall_status, 0):
overall_status = status
lines.append("")
overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n")
if use_abs:
lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
else:
overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
# --- NCCL ---
nccl = results.get("nccl")
@ -449,7 +474,7 @@ class ReportGenerator:
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
else:
eff = mem.get("efficiency_pct") or 0
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
# Compute
@ -458,14 +483,43 @@ class ReportGenerator:
if comp.get("error"):
items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
else:
per_dtype = comp.get("per_dtype_tflops", {})
eff_pct = comp.get("efficiency_pct", {})
valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
if valid_effs:
worst = min(valid_effs)
verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
if pass_thresholds:
# Absolute TFLOPS judgment, mirroring the per-dtype table above.
rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
worst_status = "PASS"
worst_dt = None
for dt, thr in pass_thresholds.items():
val = per_dtype.get(dt)
if not isinstance(val, (int, float)):
continue
if val >= thr:
st = "PASS"
elif val >= thr * 0.9:
st = "WARN"
else:
st = "FAIL"
if rank[st] > rank[worst_status]:
worst_status = st
worst_dt = dt
if worst_dt:
items.append((
"Compute Throughput",
f"{worst_status} (worst {worst_dt.upper()} "
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
))
else:
items.append(("Compute Throughput", f"{worst_status}"))
else:
items.append(("Compute Throughput", "N/A"))
valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
if valid_effs:
worst = min(valid_effs)
verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
else:
items.append(("Compute Throughput", "N/A"))
# NCCL
if "nccl" in results: