fix: resolve FP8 benchmark, NCCL parsing, and report None-value bugs
- benchmark.py: the FP8 path now calls torch._scaled_mm() with scale tensors
instead of torch.matmul(), which does not support float8_e4m3fn on Hopper
(b is now allocated as (N, K) and passed as b.T); this fixes the
"addmm_cuda not implemented" error and enables FP8 TFLOPS measurement
- nccl_test.py: fix two bugs causing all-zero bandwidth results
1. message size range changed from -b 8 -e 256M (starting at 8 bytes) to -b 8M -e 8G for a meaningful load
2. column parser corrected: parts[2] is the dtype string, not the time value;
the parser now reads time=parts[5], algbw=parts[6], busbw=parts[7] per the nccl-tests output format
- report.py: replace .get(key, 0) with .get(key) or 0 for all bandwidth/stress
fields to handle None values stored in result dicts (the default passed to
dict.get applies only when the key is missing, so it does not replace an
explicitly stored None)
- .gitignore: exclude .claude/settings.local.json
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
09f81973bc
commit
ef2ca11c58
1
.gitignore
vendored
1
.gitignore
vendored
@ -14,3 +14,4 @@ reports/
|
|||||||
.venv/
|
.venv/
|
||||||
venv/
|
venv/
|
||||||
.qoder/*
|
.qoder/*
|
||||||
|
.claude/settings.local.json
|
||||||
|
|||||||
@ -349,21 +349,35 @@ class Benchmark:
|
|||||||
|
|
||||||
if dtype_name == "fp8":
|
if dtype_name == "fp8":
|
||||||
a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
|
a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||||
b = torch.randn(K, N, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
|
b = torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||||
|
scale_a = torch.tensor(1.0, device="cuda")
|
||||||
|
scale_b = torch.tensor(1.0, device="cuda")
|
||||||
|
def _fp8_mm():
|
||||||
|
return torch._scaled_mm(a, b.T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
|
||||||
else:
|
else:
|
||||||
a = torch.randn(M, K, device="cuda", dtype=dtype_val)
|
a = torch.randn(M, K, device="cuda", dtype=dtype_val)
|
||||||
b = torch.randn(K, N, device="cuda", dtype=dtype_val)
|
b = torch.randn(K, N, device="cuda", dtype=dtype_val)
|
||||||
|
|
||||||
for _ in range(warmup):
|
if dtype_name == "fp8":
|
||||||
torch.matmul(a, b)
|
for _ in range(warmup):
|
||||||
torch.cuda.synchronize()
|
_fp8_mm()
|
||||||
|
torch.cuda.synchronize()
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
start_event.record()
|
start_event.record()
|
||||||
for _ in range(iterations):
|
for _ in range(iterations):
|
||||||
c = torch.matmul(a, b)
|
c = _fp8_mm()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
else:
|
||||||
|
for _ in range(warmup):
|
||||||
|
torch.matmul(a, b)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
start_event.record()
|
||||||
|
for _ in range(iterations):
|
||||||
|
c = torch.matmul(a, b)
|
||||||
|
end_event.record()
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
elapsed_ms = start_event.elapsed_time(end_event)
|
elapsed_ms = start_event.elapsed_time(end_event)
|
||||||
|
|||||||
@ -150,8 +150,8 @@ class NCCLTest:
|
|||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
binary,
|
binary,
|
||||||
"-b", "8",
|
"-b", "8M",
|
||||||
"-e", "256M",
|
"-e", "8G",
|
||||||
"-f", "2",
|
"-f", "2",
|
||||||
"-g", str(gpu_count),
|
"-g", str(gpu_count),
|
||||||
"-w", "5",
|
"-w", "5",
|
||||||
@ -239,12 +239,15 @@ class NCCLTest:
|
|||||||
if not line or line.startswith("#"):
|
if not line or line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
parts = line.split()
|
parts = line.split()
|
||||||
if len(parts) >= 7:
|
# nccl-tests data lines: size count type redop root time algbw busbw #wrong [time algbw busbw #wrong]
|
||||||
|
if len(parts) >= 9:
|
||||||
try:
|
try:
|
||||||
size = int(parts[0])
|
size = int(parts[0])
|
||||||
algbw = float(parts[-3]) if len(parts) >= 3 else 0
|
# parts[2] is dtype string ('float'/'int32'/etc.), not a number
|
||||||
busbw = float(parts[-2]) if len(parts) >= 2 else 0
|
# out-of-place columns: time=parts[5], algbw=parts[6], busbw=parts[7]
|
||||||
time_us = float(parts[2]) if len(parts) >= 3 else 0
|
time_us = float(parts[5])
|
||||||
|
algbw = float(parts[6])
|
||||||
|
busbw = float(parts[7])
|
||||||
size_results.append({
|
size_results.append({
|
||||||
"size": size,
|
"size": size,
|
||||||
"time_us": time_us,
|
"time_us": time_us,
|
||||||
|
|||||||
@ -256,20 +256,20 @@ class ReportGenerator:
|
|||||||
lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
|
lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
|
||||||
lines.append("| Metric | Value | Peak | Efficiency |")
|
lines.append("| Metric | Value | Peak | Efficiency |")
|
||||||
lines.append("|--------|-------|------|------------|")
|
lines.append("|--------|-------|------|------------|")
|
||||||
d2d = mem_data.get("d2d_bandwidth_gbps", 0)
|
d2d = mem_data.get("d2d_bandwidth_gbps") or 0
|
||||||
h2d = mem_data.get("h2d_bandwidth_gbps", 0)
|
h2d = mem_data.get("h2d_bandwidth_gbps") or 0
|
||||||
d2h = mem_data.get("d2h_bandwidth_gbps", 0)
|
d2h = mem_data.get("d2h_bandwidth_gbps") or 0
|
||||||
# New format with per-metric peaks
|
# New format with per-metric peaks
|
||||||
h2d_peak = mem_data.get("h2d_peak_gbps", 0)
|
h2d_peak = mem_data.get("h2d_peak_gbps") or 0
|
||||||
d2h_peak = mem_data.get("d2h_peak_gbps", 0)
|
d2h_peak = mem_data.get("d2h_peak_gbps") or 0
|
||||||
d2d_peak = mem_data.get("d2d_peak_gbps", 0)
|
d2d_peak = mem_data.get("d2d_peak_gbps") or 0
|
||||||
h2d_eff = mem_data.get("h2d_efficiency_pct", 0)
|
h2d_eff = mem_data.get("h2d_efficiency_pct") or 0
|
||||||
d2h_eff = mem_data.get("d2h_efficiency_pct", 0)
|
d2h_eff = mem_data.get("d2h_efficiency_pct") or 0
|
||||||
d2d_eff = mem_data.get("d2d_efficiency_pct", 0)
|
d2d_eff = mem_data.get("d2d_efficiency_pct") or 0
|
||||||
# Fallback for old format
|
# Fallback for old format
|
||||||
if not d2d_peak:
|
if not d2d_peak:
|
||||||
d2d_peak = mem_data.get("peak_bandwidth_gbps", 0)
|
d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0
|
||||||
d2d_eff = mem_data.get("efficiency_pct", 0)
|
d2d_eff = mem_data.get("efficiency_pct") or 0
|
||||||
lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
|
lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
|
||||||
lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
|
lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
|
||||||
lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
|
lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
|
||||||
@ -329,8 +329,8 @@ class ReportGenerator:
|
|||||||
if stress and not stress.get("error"):
|
if stress and not stress.get("error"):
|
||||||
lines.append("## Stress Test\n")
|
lines.append("## Stress Test\n")
|
||||||
passed = stress.get("passed", False)
|
passed = stress.get("passed", False)
|
||||||
duration = stress.get("duration_sec", 0)
|
duration = stress.get("duration_sec") or 0
|
||||||
elapsed = stress.get("elapsed_sec", 0)
|
elapsed = stress.get("elapsed_sec") or 0
|
||||||
source = stress.get("source", "unknown")
|
source = stress.get("source", "unknown")
|
||||||
lines.append(f"- **Source:** {source}")
|
lines.append(f"- **Source:** {source}")
|
||||||
lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
|
lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
|
||||||
@ -432,7 +432,7 @@ class ReportGenerator:
|
|||||||
if mem.get("error"):
|
if mem.get("error"):
|
||||||
items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
|
items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
|
||||||
else:
|
else:
|
||||||
eff = mem.get("efficiency_pct", 0)
|
eff = mem.get("efficiency_pct") or 0
|
||||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
|
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
|
||||||
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user