feat: 按 H100 生产验收标准更新测试指标与判定逻辑
- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段 (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400),与 marketing peak 解耦,作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% / FAIL <门槛×90%);表头切换为 Threshold 列;Memory D2D verdict 由 50/30 收紧至 80/60;无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter 0.45、Broadcast/SendRecv 0.40、AllToAll 0.35,与验收文档 §5 一致
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、iterations 100→500、use_compile 改 true;health temp_warning 80→75、temp_critical 90→85,匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
375d439abb
commit
fc97a768cf
@ -14,14 +14,14 @@ benchmark:
|
||||
- fp16
|
||||
- bf16
|
||||
- fp8
|
||||
matrix_size: 4096
|
||||
warmup: 10
|
||||
iterations: 100
|
||||
use_compile: false
|
||||
matrix_size: 8192
|
||||
warmup: 50
|
||||
iterations: 500
|
||||
use_compile: true
|
||||
|
||||
health:
|
||||
temp_warning: 80
|
||||
temp_critical: 90
|
||||
temp_warning: 75
|
||||
temp_critical: 85
|
||||
power_limit: null # null = auto-detect from GPU TDP per gpu_specs.py
|
||||
|
||||
nccl:
|
||||
|
||||
@ -469,6 +469,12 @@ class Benchmark:
|
||||
"per_dtype_tflops": results_by_dtype,
|
||||
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
||||
"efficiency_pct": efficiency,
|
||||
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
|
||||
# report.py judges PASS/WARN/FAIL against these directly instead of
|
||||
# using % of peak. Empty dict => fall back to the legacy %-of-peak rule (PASS >= 80%, WARN >= 50%).
|
||||
"pass_thresholds_tflops": dict(
|
||||
self.specs.get("compute_pass_thresholds_tflops") or {}
|
||||
),
|
||||
"per_gpu": per_gpu_results,
|
||||
"matrix_size": matrix_size,
|
||||
"warmup": warmup,
|
||||
|
||||
@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = {
|
||||
# Specs database — ALL values are DENSE (non-sparse) TFLOPS
|
||||
GPU_SPECS = {
|
||||
"h100": {
|
||||
# Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
|
||||
# `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
|
||||
# by report.py — decoupled from peaks so marketing-spec changes (dense vs
|
||||
# sparse vs FP8-sparsity) don't shift the validation bar.
|
||||
"full_name": "NVIDIA H100 SXM5",
|
||||
"architecture": "Hopper",
|
||||
"compute_capability": 9.0,
|
||||
@ -31,6 +35,11 @@ GPU_SPECS = {
|
||||
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
||||
"bf16_tflops": 990, # dense
|
||||
"fp8_tflops": 1979, # dense
|
||||
"compute_pass_thresholds_tflops": {
|
||||
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
|
||||
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
|
||||
# doesn't currently exercise these dtypes.
|
||||
},
|
||||
"tdp_watts": 700,
|
||||
"nvlink_gen": 4,
|
||||
"nvlink_bandwidth_gbps": 900, # bidirectional
|
||||
@ -171,6 +180,7 @@ _UNKNOWN_SPECS = {
|
||||
"fp16_tflops": 0,
|
||||
"bf16_tflops": 0,
|
||||
"fp8_tflops": 0,
|
||||
"compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak
|
||||
"tdp_watts": 700,
|
||||
"nvlink_gen": 0,
|
||||
"nvlink_bandwidth_gbps": 0,
|
||||
|
||||
@ -24,18 +24,16 @@ except ImportError:
|
||||
|
||||
|
||||
# Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
|
||||
# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh
|
||||
# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce.
|
||||
# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range
|
||||
# vs AllReduce in 400-500 GB/s. Using a single 40% threshold for both produced
|
||||
# false positives for AllToAll.
|
||||
# Values aligned with the H100 production acceptance criteria (acceptance doc §5).
|
||||
# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is
|
||||
# set lower; broadcast/sendrecv sit between.
|
||||
_OP_BW_FRACTIONS = {
|
||||
"allreduce": 0.40,
|
||||
"alltoall": 0.30,
|
||||
"broadcast": 0.35,
|
||||
"reducescatter": 0.38,
|
||||
"allgather": 0.38,
|
||||
"sendrecv": 0.35,
|
||||
"allreduce": 0.45,
|
||||
"allgather": 0.45,
|
||||
"reducescatter": 0.45,
|
||||
"broadcast": 0.40,
|
||||
"sendrecv": 0.40,
|
||||
"alltoall": 0.35,
|
||||
}
|
||||
|
||||
|
||||
@ -105,7 +103,7 @@ class NCCLTest:
|
||||
return float(user_override)
|
||||
if nvlink_bw <= 0:
|
||||
return 10.0 # conservative floor
|
||||
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40)
|
||||
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45)
|
||||
return round(nvlink_bw * frac)
|
||||
|
||||
if self.gpu_type == "unknown":
|
||||
|
||||
@ -283,7 +283,8 @@ class ReportGenerator:
|
||||
"nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
|
||||
)
|
||||
else:
|
||||
verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
|
||||
# Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%.
|
||||
verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
|
||||
lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
|
||||
|
||||
# --- Compute Throughput ---
|
||||
@ -293,9 +294,18 @@ class ReportGenerator:
|
||||
per_dtype = comp_data.get("per_dtype_tflops", {})
|
||||
peak_tflops = comp_data.get("peak_tflops", {})
|
||||
eff_pct = comp_data.get("efficiency_pct", {})
|
||||
# Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
|
||||
# When present, override the legacy 80%-of-peak rule on a per-dtype basis.
|
||||
pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
|
||||
use_abs = bool(pass_thresholds)
|
||||
if use_abs:
|
||||
lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
|
||||
else:
|
||||
lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
|
||||
lines.append("|-------|-------------------|------|------------|--------|")
|
||||
worst_eff = 100.0
|
||||
overall_status = "PASS"
|
||||
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
|
||||
for dt, val in per_dtype.items():
|
||||
if isinstance(val, str):
|
||||
# skipped or error
|
||||
@ -305,11 +315,26 @@ class ReportGenerator:
|
||||
ef = eff_pct.get(dt, 0)
|
||||
if isinstance(ef, (int, float)) and ef > 0:
|
||||
worst_eff = min(worst_eff, ef)
|
||||
thr = pass_thresholds.get(dt)
|
||||
if use_abs and thr:
|
||||
if val >= thr:
|
||||
status = "PASS"
|
||||
elif val >= thr * 0.9:
|
||||
status = "WARN"
|
||||
else:
|
||||
status = "FAIL"
|
||||
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
|
||||
else:
|
||||
status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
|
||||
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
|
||||
if rank.get(status, 0) > rank.get(overall_status, 0):
|
||||
overall_status = status
|
||||
lines.append("")
|
||||
overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
|
||||
lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n")
|
||||
if use_abs:
|
||||
lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
|
||||
else:
|
||||
overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
|
||||
lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
|
||||
|
||||
# --- NCCL ---
|
||||
nccl = results.get("nccl")
|
||||
@ -449,7 +474,7 @@ class ReportGenerator:
|
||||
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
|
||||
else:
|
||||
eff = mem.get("efficiency_pct") or 0
|
||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
|
||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
|
||||
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
||||
|
||||
# Compute
|
||||
@ -458,7 +483,36 @@ class ReportGenerator:
|
||||
if comp.get("error"):
|
||||
items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
|
||||
else:
|
||||
per_dtype = comp.get("per_dtype_tflops", {})
|
||||
eff_pct = comp.get("efficiency_pct", {})
|
||||
pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
|
||||
if pass_thresholds:
|
||||
# Absolute TFLOPS judgment, mirroring the per-dtype table above.
|
||||
rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
|
||||
worst_status = "PASS"
|
||||
worst_dt = None
|
||||
for dt, thr in pass_thresholds.items():
|
||||
val = per_dtype.get(dt)
|
||||
if not isinstance(val, (int, float)):
|
||||
continue
|
||||
if val >= thr:
|
||||
st = "PASS"
|
||||
elif val >= thr * 0.9:
|
||||
st = "WARN"
|
||||
else:
|
||||
st = "FAIL"
|
||||
if rank[st] > rank[worst_status]:
|
||||
worst_status = st
|
||||
worst_dt = dt
|
||||
if worst_dt:
|
||||
items.append((
|
||||
"Compute Throughput",
|
||||
f"{worst_status} (worst {worst_dt.upper()} "
|
||||
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
|
||||
))
|
||||
else:
|
||||
items.append(("Compute Throughput", f"{worst_status}"))
|
||||
else:
|
||||
valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
|
||||
if valid_effs:
|
||||
worst = min(valid_effs)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user