feat: 按 H100 生产验收标准更新测试指标与判定逻辑
- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段 (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400),与 marketing peak 解耦,作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% / FAIL <门槛×90%);表头切换为 Threshold 列;Memory D2D verdict 由 50/30 收紧至 80/60(汇总项的 WARN 下限同步由 50 调整为 60);无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter 0.45、Broadcast/SendRecv 0.40、AllToAll 0.35,与验收文档 §5 一致;未列出操作的默认比例由 0.40 调整为 0.45
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、iterations 100→500、use_compile 改 true;health temp_warning 80→75、temp_critical 90→85,匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
375d439abb
commit
fc97a768cf
@ -14,14 +14,14 @@ benchmark:
|
|||||||
- fp16
|
- fp16
|
||||||
- bf16
|
- bf16
|
||||||
- fp8
|
- fp8
|
||||||
matrix_size: 4096
|
matrix_size: 8192
|
||||||
warmup: 10
|
warmup: 50
|
||||||
iterations: 100
|
iterations: 500
|
||||||
use_compile: false
|
use_compile: true
|
||||||
|
|
||||||
health:
|
health:
|
||||||
temp_warning: 80
|
temp_warning: 75
|
||||||
temp_critical: 90
|
temp_critical: 85
|
||||||
power_limit: null # null = auto-detect from GPU TDP per gpu_specs.py
|
power_limit: null # null = auto-detect from GPU TDP per gpu_specs.py
|
||||||
|
|
||||||
nccl:
|
nccl:
|
||||||
|
|||||||
@ -469,6 +469,12 @@ class Benchmark:
|
|||||||
"per_dtype_tflops": results_by_dtype,
|
"per_dtype_tflops": results_by_dtype,
|
||||||
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
||||||
"efficiency_pct": efficiency,
|
"efficiency_pct": efficiency,
|
||||||
|
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
|
||||||
|
# report.py judges PASS/WARN/FAIL against these directly instead of
|
||||||
|
# using % of peak. Empty dict => fall back to legacy 80% rule.
|
||||||
|
"pass_thresholds_tflops": dict(
|
||||||
|
self.specs.get("compute_pass_thresholds_tflops") or {}
|
||||||
|
),
|
||||||
"per_gpu": per_gpu_results,
|
"per_gpu": per_gpu_results,
|
||||||
"matrix_size": matrix_size,
|
"matrix_size": matrix_size,
|
||||||
"warmup": warmup,
|
"warmup": warmup,
|
||||||
|
|||||||
@ -20,6 +20,10 @@ GPU_NAME_PATTERNS = {
|
|||||||
# Specs database — ALL values are DENSE (non-sparse) TFLOPS
|
# Specs database — ALL values are DENSE (non-sparse) TFLOPS
|
||||||
GPU_SPECS = {
|
GPU_SPECS = {
|
||||||
"h100": {
|
"h100": {
|
||||||
|
# Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
|
||||||
|
# `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
|
||||||
|
# by report.py — decoupled from peaks so marketing-spec changes (dense vs
|
||||||
|
# sparse vs FP8-sparsity) don't shift the validation bar.
|
||||||
"full_name": "NVIDIA H100 SXM5",
|
"full_name": "NVIDIA H100 SXM5",
|
||||||
"architecture": "Hopper",
|
"architecture": "Hopper",
|
||||||
"compute_capability": 9.0,
|
"compute_capability": 9.0,
|
||||||
@ -31,6 +35,11 @@ GPU_SPECS = {
|
|||||||
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
||||||
"bf16_tflops": 990, # dense
|
"bf16_tflops": 990, # dense
|
||||||
"fp8_tflops": 1979, # dense
|
"fp8_tflops": 1979, # dense
|
||||||
|
"compute_pass_thresholds_tflops": {
|
||||||
|
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
|
||||||
|
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
|
||||||
|
# doesn't currently exercise these dtypes.
|
||||||
|
},
|
||||||
"tdp_watts": 700,
|
"tdp_watts": 700,
|
||||||
"nvlink_gen": 4,
|
"nvlink_gen": 4,
|
||||||
"nvlink_bandwidth_gbps": 900, # bidirectional
|
"nvlink_bandwidth_gbps": 900, # bidirectional
|
||||||
@ -171,6 +180,7 @@ _UNKNOWN_SPECS = {
|
|||||||
"fp16_tflops": 0,
|
"fp16_tflops": 0,
|
||||||
"bf16_tflops": 0,
|
"bf16_tflops": 0,
|
||||||
"fp8_tflops": 0,
|
"fp8_tflops": 0,
|
||||||
|
"compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak
|
||||||
"tdp_watts": 700,
|
"tdp_watts": 700,
|
||||||
"nvlink_gen": 0,
|
"nvlink_gen": 0,
|
||||||
"nvlink_bandwidth_gbps": 0,
|
"nvlink_bandwidth_gbps": 0,
|
||||||
|
|||||||
@ -24,18 +24,16 @@ except ImportError:
|
|||||||
|
|
||||||
|
|
||||||
# Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
|
# Per-operation bandwidth thresholds, as a fraction of NVLink bidirectional BW.
|
||||||
# AllReduce uses ring algorithm and saturates ring BW; AllToAll requires full-mesh
|
# Values aligned with the H100 production acceptance criteria (acceptance doc §5).
|
||||||
# transfers and on 8-GPU NVSwitch typically runs 10-20% lower than AllReduce.
|
# AllToAll runs ~10-20% lower than AllReduce on 8-GPU NVSwitch, so its fraction is
|
||||||
# Public H100/H200 8-GPU benchmarks show AllToAll bus BW in the 300-380 GB/s range
|
# set lower; broadcast/sendrecv sit between.
|
||||||
# vs AllReduce in 400-500 GB/s. Using a single 40% threshold for both produced
|
|
||||||
# false positives for AllToAll.
|
|
||||||
_OP_BW_FRACTIONS = {
|
_OP_BW_FRACTIONS = {
|
||||||
"allreduce": 0.40,
|
"allreduce": 0.45,
|
||||||
"alltoall": 0.30,
|
"allgather": 0.45,
|
||||||
"broadcast": 0.35,
|
"reducescatter": 0.45,
|
||||||
"reducescatter": 0.38,
|
"broadcast": 0.40,
|
||||||
"allgather": 0.38,
|
"sendrecv": 0.40,
|
||||||
"sendrecv": 0.35,
|
"alltoall": 0.35,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -105,7 +103,7 @@ class NCCLTest:
|
|||||||
return float(user_override)
|
return float(user_override)
|
||||||
if nvlink_bw <= 0:
|
if nvlink_bw <= 0:
|
||||||
return 10.0 # conservative floor
|
return 10.0 # conservative floor
|
||||||
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.40)
|
frac = _OP_BW_FRACTIONS.get(label.lower(), 0.45)
|
||||||
return round(nvlink_bw * frac)
|
return round(nvlink_bw * frac)
|
||||||
|
|
||||||
if self.gpu_type == "unknown":
|
if self.gpu_type == "unknown":
|
||||||
|
|||||||
@ -283,7 +283,8 @@ class ReportGenerator:
|
|||||||
"nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
|
"nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
|
# Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%.
|
||||||
|
verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
|
||||||
lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
|
lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
|
||||||
|
|
||||||
# --- Compute Throughput ---
|
# --- Compute Throughput ---
|
||||||
@ -293,9 +294,18 @@ class ReportGenerator:
|
|||||||
per_dtype = comp_data.get("per_dtype_tflops", {})
|
per_dtype = comp_data.get("per_dtype_tflops", {})
|
||||||
peak_tflops = comp_data.get("peak_tflops", {})
|
peak_tflops = comp_data.get("peak_tflops", {})
|
||||||
eff_pct = comp_data.get("efficiency_pct", {})
|
eff_pct = comp_data.get("efficiency_pct", {})
|
||||||
lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
|
# Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
|
||||||
|
# When present, override the legacy 80%-of-peak rule on a per-dtype basis.
|
||||||
|
pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
|
||||||
|
use_abs = bool(pass_thresholds)
|
||||||
|
if use_abs:
|
||||||
|
lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
|
||||||
|
else:
|
||||||
|
lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
|
||||||
lines.append("|-------|-------------------|------|------------|--------|")
|
lines.append("|-------|-------------------|------|------------|--------|")
|
||||||
worst_eff = 100.0
|
worst_eff = 100.0
|
||||||
|
overall_status = "PASS"
|
||||||
|
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
|
||||||
for dt, val in per_dtype.items():
|
for dt, val in per_dtype.items():
|
||||||
if isinstance(val, str):
|
if isinstance(val, str):
|
||||||
# skipped or error
|
# skipped or error
|
||||||
@ -305,11 +315,26 @@ class ReportGenerator:
|
|||||||
ef = eff_pct.get(dt, 0)
|
ef = eff_pct.get(dt, 0)
|
||||||
if isinstance(ef, (int, float)) and ef > 0:
|
if isinstance(ef, (int, float)) and ef > 0:
|
||||||
worst_eff = min(worst_eff, ef)
|
worst_eff = min(worst_eff, ef)
|
||||||
status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
|
thr = pass_thresholds.get(dt)
|
||||||
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
|
if use_abs and thr:
|
||||||
|
if val >= thr:
|
||||||
|
status = "PASS"
|
||||||
|
elif val >= thr * 0.9:
|
||||||
|
status = "WARN"
|
||||||
|
else:
|
||||||
|
status = "FAIL"
|
||||||
|
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
|
||||||
|
else:
|
||||||
|
status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
|
||||||
|
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
|
||||||
|
if rank.get(status, 0) > rank.get(overall_status, 0):
|
||||||
|
overall_status = status
|
||||||
lines.append("")
|
lines.append("")
|
||||||
overall = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
|
if use_abs:
|
||||||
lines.append(f"**Verdict: {overall}** (worst efficiency {worst_eff:.1f}%)\n")
|
lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
|
||||||
|
else:
|
||||||
|
overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
|
||||||
|
lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
|
||||||
|
|
||||||
# --- NCCL ---
|
# --- NCCL ---
|
||||||
nccl = results.get("nccl")
|
nccl = results.get("nccl")
|
||||||
@ -449,7 +474,7 @@ class ReportGenerator:
|
|||||||
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
|
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
|
||||||
else:
|
else:
|
||||||
eff = mem.get("efficiency_pct") or 0
|
eff = mem.get("efficiency_pct") or 0
|
||||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
|
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
|
||||||
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
||||||
|
|
||||||
# Compute
|
# Compute
|
||||||
@ -458,14 +483,43 @@ class ReportGenerator:
|
|||||||
if comp.get("error"):
|
if comp.get("error"):
|
||||||
items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
|
items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
|
||||||
else:
|
else:
|
||||||
|
per_dtype = comp.get("per_dtype_tflops", {})
|
||||||
eff_pct = comp.get("efficiency_pct", {})
|
eff_pct = comp.get("efficiency_pct", {})
|
||||||
valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
|
pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
|
||||||
if valid_effs:
|
if pass_thresholds:
|
||||||
worst = min(valid_effs)
|
# Absolute TFLOPS judgment, mirroring the per-dtype table above.
|
||||||
verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
|
rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
|
||||||
items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
|
worst_status = "PASS"
|
||||||
|
worst_dt = None
|
||||||
|
for dt, thr in pass_thresholds.items():
|
||||||
|
val = per_dtype.get(dt)
|
||||||
|
if not isinstance(val, (int, float)):
|
||||||
|
continue
|
||||||
|
if val >= thr:
|
||||||
|
st = "PASS"
|
||||||
|
elif val >= thr * 0.9:
|
||||||
|
st = "WARN"
|
||||||
|
else:
|
||||||
|
st = "FAIL"
|
||||||
|
if rank[st] > rank[worst_status]:
|
||||||
|
worst_status = st
|
||||||
|
worst_dt = dt
|
||||||
|
if worst_dt:
|
||||||
|
items.append((
|
||||||
|
"Compute Throughput",
|
||||||
|
f"{worst_status} (worst {worst_dt.upper()} "
|
||||||
|
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
items.append(("Compute Throughput", f"{worst_status}"))
|
||||||
else:
|
else:
|
||||||
items.append(("Compute Throughput", "N/A"))
|
valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
|
||||||
|
if valid_effs:
|
||||||
|
worst = min(valid_effs)
|
||||||
|
verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
|
||||||
|
items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
|
||||||
|
else:
|
||||||
|
items.append(("Compute Throughput", "N/A"))
|
||||||
|
|
||||||
# NCCL
|
# NCCL
|
||||||
if "nccl" in results:
|
if "nccl" in results:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user