fix: resolve FP8 benchmark, NCCL parsing, and report None-value bugs

- benchmark.py: FP8 dtype now uses torch._scaled_mm() with scale tensors
  instead of torch.matmul(), which does not support float8_e4m3fn on Hopper
  (b is now allocated as (N, K) and passed as b.T); fixes the
  "addmm_cuda not implemented" error and enables FP8 TFLOPS measurement

- nccl_test.py: fix two bugs causing all-zero bandwidth results
  1. message-size range changed from -b 8 -e 256M (starting at 8 bytes) to
     -b 8M -e 8G for a meaningful load
  2. column parser corrected: parts[2] is the dtype string, not the time value;
     now reads time=parts[5], algbw=parts[6], busbw=parts[7] per nccl-tests format

- report.py: replace .get(key, 0) with .get(key) or 0 for all bandwidth/stress
  fields to handle None values stored in result dicts (the dict.get default is
  used only when the key is missing, so it does not replace an explicitly
  stored None)

- .gitignore: exclude .claude/settings.local.json

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
zulifeng 2026-05-10 15:53:12 +08:00
parent 09f81973bc
commit ef2ca11c58
4 changed files with 49 additions and 31 deletions

1
.gitignore vendored
View File

@ -14,3 +14,4 @@ reports/
.venv/ .venv/
venv/ venv/
.qoder/* .qoder/*
.claude/settings.local.json

View File

@ -349,15 +349,29 @@ class Benchmark:
if dtype_name == "fp8": if dtype_name == "fp8":
a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
b = torch.randn(K, N, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) b = torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
scale_a = torch.tensor(1.0, device="cuda")
scale_b = torch.tensor(1.0, device="cuda")
def _fp8_mm():
return torch._scaled_mm(a, b.T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
else: else:
a = torch.randn(M, K, device="cuda", dtype=dtype_val) a = torch.randn(M, K, device="cuda", dtype=dtype_val)
b = torch.randn(K, N, device="cuda", dtype=dtype_val) b = torch.randn(K, N, device="cuda", dtype=dtype_val)
if dtype_name == "fp8":
for _ in range(warmup):
_fp8_mm()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(iterations):
c = _fp8_mm()
end_event.record()
else:
for _ in range(warmup): for _ in range(warmup):
torch.matmul(a, b) torch.matmul(a, b)
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
start_event.record() start_event.record()

View File

@ -150,8 +150,8 @@ class NCCLTest:
cmd = [ cmd = [
binary, binary,
"-b", "8", "-b", "8M",
"-e", "256M", "-e", "8G",
"-f", "2", "-f", "2",
"-g", str(gpu_count), "-g", str(gpu_count),
"-w", "5", "-w", "5",
@ -239,12 +239,15 @@ class NCCLTest:
if not line or line.startswith("#"): if not line or line.startswith("#"):
continue continue
parts = line.split() parts = line.split()
if len(parts) >= 7: # nccl-tests data lines: size count type redop root time algbw busbw #wrong [time algbw busbw #wrong]
if len(parts) >= 9:
try: try:
size = int(parts[0]) size = int(parts[0])
algbw = float(parts[-3]) if len(parts) >= 3 else 0 # parts[2] is dtype string ('float'/'int32'/etc.), not a number
busbw = float(parts[-2]) if len(parts) >= 2 else 0 # out-of-place columns: time=parts[5], algbw=parts[6], busbw=parts[7]
time_us = float(parts[2]) if len(parts) >= 3 else 0 time_us = float(parts[5])
algbw = float(parts[6])
busbw = float(parts[7])
size_results.append({ size_results.append({
"size": size, "size": size,
"time_us": time_us, "time_us": time_us,

View File

@ -256,20 +256,20 @@ class ReportGenerator:
lines.append(f"Source: {mem_data.get('source', 'unknown')}\n") lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
lines.append("| Metric | Value | Peak | Efficiency |") lines.append("| Metric | Value | Peak | Efficiency |")
lines.append("|--------|-------|------|------------|") lines.append("|--------|-------|------|------------|")
d2d = mem_data.get("d2d_bandwidth_gbps", 0) d2d = mem_data.get("d2d_bandwidth_gbps") or 0
h2d = mem_data.get("h2d_bandwidth_gbps", 0) h2d = mem_data.get("h2d_bandwidth_gbps") or 0
d2h = mem_data.get("d2h_bandwidth_gbps", 0) d2h = mem_data.get("d2h_bandwidth_gbps") or 0
# New format with per-metric peaks # New format with per-metric peaks
h2d_peak = mem_data.get("h2d_peak_gbps", 0) h2d_peak = mem_data.get("h2d_peak_gbps") or 0
d2h_peak = mem_data.get("d2h_peak_gbps", 0) d2h_peak = mem_data.get("d2h_peak_gbps") or 0
d2d_peak = mem_data.get("d2d_peak_gbps", 0) d2d_peak = mem_data.get("d2d_peak_gbps") or 0
h2d_eff = mem_data.get("h2d_efficiency_pct", 0) h2d_eff = mem_data.get("h2d_efficiency_pct") or 0
d2h_eff = mem_data.get("d2h_efficiency_pct", 0) d2h_eff = mem_data.get("d2h_efficiency_pct") or 0
d2d_eff = mem_data.get("d2d_efficiency_pct", 0) d2d_eff = mem_data.get("d2d_efficiency_pct") or 0
# Fallback for old format # Fallback for old format
if not d2d_peak: if not d2d_peak:
d2d_peak = mem_data.get("peak_bandwidth_gbps", 0) d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0
d2d_eff = mem_data.get("efficiency_pct", 0) d2d_eff = mem_data.get("efficiency_pct") or 0
lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |") lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |") lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |") lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
@ -329,8 +329,8 @@ class ReportGenerator:
if stress and not stress.get("error"): if stress and not stress.get("error"):
lines.append("## Stress Test\n") lines.append("## Stress Test\n")
passed = stress.get("passed", False) passed = stress.get("passed", False)
duration = stress.get("duration_sec", 0) duration = stress.get("duration_sec") or 0
elapsed = stress.get("elapsed_sec", 0) elapsed = stress.get("elapsed_sec") or 0
source = stress.get("source", "unknown") source = stress.get("source", "unknown")
lines.append(f"- **Source:** {source}") lines.append(f"- **Source:** {source}")
lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)") lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
@ -432,7 +432,7 @@ class ReportGenerator:
if mem.get("error"): if mem.get("error"):
items.append(("Memory Bandwidth", f"ERROR: {mem['error']}")) items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
else: else:
eff = mem.get("efficiency_pct", 0) eff = mem.get("efficiency_pct") or 0
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL") verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)")) items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))