From ef2ca11c58064c7cda5a869e056c4c4108484e22 Mon Sep 17 00:00:00 2001 From: zulifeng Date: Sun, 10 May 2026 15:53:12 +0800 Subject: [PATCH] fix: resolve FP8 benchmark, NCCL parsing, and report None-value bugs - benchmark.py: FP8 dtype now uses torch._scaled_mm() with scale tensors instead of torch.matmul() which does not support float8_e4m3fn on Hopper; fixes "addmm_cuda not implemented" error and enables FP8 TFLOPS measurement - nccl_test.py: fix two bugs causing all-zero bandwidth results 1. buffer size changed from -b 8 (8 bytes) to -b 8M -e 8G for meaningful load 2. column parser corrected: parts[2] is dtype string not time value; now reads time=parts[5], algbw=parts[6], busbw=parts[7] per nccl-tests format - report.py: replace .get(key, 0) with .get(key) or 0 at all bandwidth/stress fields to handle None values stored in result dicts (dict.get with default does not override an explicitly stored None) - .gitignore: exclude .claude/settings.local.json Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + modules/benchmark.py | 36 +++++++++++++++++++++++++----------- modules/nccl_test.py | 15 +++++++++------ modules/report.py | 28 ++++++++++++++-------------- 4 files changed, 49 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 30a9cdb..934bb96 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ reports/ .venv/ venv/ .qoder/* +.claude/settings.local.json diff --git a/modules/benchmark.py b/modules/benchmark.py index dce8b6c..ba91221 100644 --- a/modules/benchmark.py +++ b/modules/benchmark.py @@ -349,21 +349,35 @@ class Benchmark: if dtype_name == "fp8": a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) - b = torch.randn(K, N, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) + b = torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) + scale_a = torch.tensor(1.0, device="cuda") + scale_b = torch.tensor(1.0, device="cuda") + def _fp8_mm(): + return torch._scaled_mm(a, b.T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16) else: a = torch.randn(M, K, device="cuda", dtype=dtype_val) b = torch.randn(K, N, device="cuda", dtype=dtype_val) - for _ in range(warmup): - torch.matmul(a, b) - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - for _ in range(iterations): - c = torch.matmul(a, b) - end_event.record() + if dtype_name == "fp8": + for _ in range(warmup): + _fp8_mm() + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iterations): + c = _fp8_mm() + end_event.record() + else: + for _ in range(warmup): + torch.matmul(a, b) + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iterations): + c = torch.matmul(a, b) + end_event.record() torch.cuda.synchronize() elapsed_ms = start_event.elapsed_time(end_event) diff --git a/modules/nccl_test.py b/modules/nccl_test.py index 77ab2bd..ae10d60 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -150,8 +150,8 @@ class NCCLTest: cmd = [ binary, - "-b", "8", - "-e", "256M", + "-b", "8M", + "-e", "8G", "-f", "2", "-g", str(gpu_count), "-w", "5", @@ -239,12 +239,15 @@ class NCCLTest: if not line or line.startswith("#"): continue parts = line.split() - if len(parts) >= 7: + # nccl-tests data lines: size count type redop root time algbw busbw #wrong [time algbw busbw #wrong] + if len(parts) >= 9: try: size = int(parts[0]) - algbw = float(parts[-3]) if len(parts) >= 3 else 0 - busbw = float(parts[-2]) if len(parts) >= 2 else 0 - time_us = float(parts[2]) if len(parts) >= 3 else 0 + # parts[2] is dtype string ('float'/'int32'/etc.), not a number + # out-of-place columns: time=parts[5], algbw=parts[6], busbw=parts[7] + time_us = float(parts[5]) + algbw = float(parts[6]) + busbw = float(parts[7]) size_results.append({ "size": size, "time_us": time_us, diff --git a/modules/report.py b/modules/report.py index 9278eda..93ccbb8 100644 --- a/modules/report.py +++ b/modules/report.py @@ -256,20 +256,20 @@ class ReportGenerator: lines.append(f"Source: {mem_data.get('source', 'unknown')}\n") lines.append("| Metric | Value | Peak | Efficiency |") lines.append("|--------|-------|------|------------|") - d2d = mem_data.get("d2d_bandwidth_gbps", 0) - h2d = mem_data.get("h2d_bandwidth_gbps", 0) - d2h = mem_data.get("d2h_bandwidth_gbps", 0) + d2d = mem_data.get("d2d_bandwidth_gbps") or 0 + h2d = mem_data.get("h2d_bandwidth_gbps") or 0 + d2h = mem_data.get("d2h_bandwidth_gbps") or 0 # New format with per-metric peaks - h2d_peak = mem_data.get("h2d_peak_gbps", 0) - d2h_peak = mem_data.get("d2h_peak_gbps", 0) - d2d_peak = mem_data.get("d2d_peak_gbps", 0) - h2d_eff = mem_data.get("h2d_efficiency_pct", 0) - d2h_eff = mem_data.get("d2h_efficiency_pct", 0) - d2d_eff = mem_data.get("d2d_efficiency_pct", 0) + h2d_peak = mem_data.get("h2d_peak_gbps") or 0 + d2h_peak = mem_data.get("d2h_peak_gbps") or 0 + d2d_peak = mem_data.get("d2d_peak_gbps") or 0 + h2d_eff = mem_data.get("h2d_efficiency_pct") or 0 + d2h_eff = mem_data.get("d2h_efficiency_pct") or 0 + d2d_eff = mem_data.get("d2d_efficiency_pct") or 0 # Fallback for old format if not d2d_peak: - d2d_peak = mem_data.get("peak_bandwidth_gbps", 0) - d2d_eff = mem_data.get("efficiency_pct", 0) + d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0 + d2d_eff = mem_data.get("efficiency_pct") or 0 lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |") lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |") lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |") @@ -329,8 +329,8 @@ class ReportGenerator: if stress and not stress.get("error"): lines.append("## Stress Test\n") passed = stress.get("passed", False) - duration = stress.get("duration_sec", 0) - elapsed = stress.get("elapsed_sec", 0) + duration = stress.get("duration_sec") or 0 + elapsed = stress.get("elapsed_sec") or 0 source = stress.get("source", "unknown") lines.append(f"- **Source:** {source}") lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)") @@ -432,7 +432,7 @@ class ReportGenerator: if mem.get("error"): items.append(("Memory Bandwidth", f"ERROR: {mem['error']}")) else: - eff = mem.get("efficiency_pct", 0) + eff = mem.get("efficiency_pct") or 0 verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL") items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))