fix: resolve FP8 benchmark, NCCL parsing, and report None-value bugs

- benchmark.py: the FP8 dtype now uses torch._scaled_mm() with scale tensors instead of torch.matmul(), which does not support float8_e4m3fn on Hopper; this fixes the "addmm_cuda not implemented" error and enables FP8 TFLOPS measurement.
- nccl_test.py: fix two bugs that caused all-zero bandwidth results:
  1. The message-size range changed from -b 8 (8 bytes) -e 256M to -b 8M -e 8G for a meaningful load.
  2. The column parser is corrected: parts[2] is the dtype string, not the time value; it now reads time=parts[5], algbw=parts[6], busbw=parts[7] (the out-of-place columns) per the nccl-tests output format.
- report.py: replace .get(key, 0) with .get(key) or 0 at all bandwidth/stress fields to handle None values stored in result dicts (dict.get with a default does not override an explicitly stored None).
- .gitignore: exclude .claude/settings.local.json.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-10 15:53:12 +08:00 · 2026-05-10 15:53:12 +08:00 · ef2ca11c58
commit ef2ca11c58
parent 09f81973bc
4 changed files with 49 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -14,3 +14,4 @@ reports/
 .venv/
 venv/
 .qoder/*
 .claude/settings.local.json
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@ -349,15 +349,29 @@ class Benchmark:
                    if dtype_name == "fp8":
                        a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
-                        b = torch.randn(K, N, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
+                        b = torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
                        scale_a = torch.tensor(1.0, device="cuda")
                        scale_b = torch.tensor(1.0, device="cuda")
                        def _fp8_mm():
                            return torch._scaled_mm(a, b.T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
                    else:
                        a = torch.randn(M, K, device="cuda", dtype=dtype_val)
                        b = torch.randn(K, N, device="cuda", dtype=dtype_val)
                    if dtype_name == "fp8":
                        for _ in range(warmup):
                            _fp8_mm()
                        torch.cuda.synchronize()
                        start_event = torch.cuda.Event(enable_timing=True)
                        end_event = torch.cuda.Event(enable_timing=True)
                        start_event.record()
                        for _ in range(iterations):
                            c = _fp8_mm()
                        end_event.record()
                    else:
                        for _ in range(warmup):
                            torch.matmul(a, b)
                        torch.cuda.synchronize()
                        start_event = torch.cuda.Event(enable_timing=True)
                        end_event = torch.cuda.Event(enable_timing=True)
                        start_event.record()
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@ -150,8 +150,8 @@ class NCCLTest:
        cmd = [
            binary,
-            "-b", "8",
+            "-b", "8M",
-            "-e", "256M",
+            "-e", "8G",
            "-f", "2",
            "-g", str(gpu_count),
            "-w", "5",
@ -239,12 +239,15 @@ class NCCLTest:
            if not line or line.startswith("#"):
                continue
            parts = line.split()
-            if len(parts) >= 7:
+            # nccl-tests data lines: size count type redop root time algbw busbw #wrong [time algbw busbw #wrong]
            if len(parts) >= 9:
                try:
                    size = int(parts[0])
-                    algbw = float(parts[-3]) if len(parts) >= 3 else 0
+                    # parts[2] is dtype string ('float'/'int32'/etc.), not a number
-                    busbw = float(parts[-2]) if len(parts) >= 2 else 0
+                    # out-of-place columns: time=parts[5], algbw=parts[6], busbw=parts[7]
-                    time_us = float(parts[2]) if len(parts) >= 3 else 0
+                    time_us = float(parts[5])
                    algbw = float(parts[6])
                    busbw = float(parts[7])
                    size_results.append({
                        "size": size,
                        "time_us": time_us,
--- a/modules/report.py
+++ b/modules/report.py
@ -256,20 +256,20 @@ class ReportGenerator:
            lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
            lines.append("| Metric | Value | Peak | Efficiency |")
            lines.append("|--------|-------|------|------------|")
-            d2d = mem_data.get("d2d_bandwidth_gbps", 0)
+            d2d = mem_data.get("d2d_bandwidth_gbps") or 0
-            h2d = mem_data.get("h2d_bandwidth_gbps", 0)
+            h2d = mem_data.get("h2d_bandwidth_gbps") or 0
-            d2h = mem_data.get("d2h_bandwidth_gbps", 0)
+            d2h = mem_data.get("d2h_bandwidth_gbps") or 0
            # New format with per-metric peaks
-            h2d_peak = mem_data.get("h2d_peak_gbps", 0)
+            h2d_peak = mem_data.get("h2d_peak_gbps") or 0
-            d2h_peak = mem_data.get("d2h_peak_gbps", 0)
+            d2h_peak = mem_data.get("d2h_peak_gbps") or 0
-            d2d_peak = mem_data.get("d2d_peak_gbps", 0)
+            d2d_peak = mem_data.get("d2d_peak_gbps") or 0
-            h2d_eff = mem_data.get("h2d_efficiency_pct", 0)
+            h2d_eff = mem_data.get("h2d_efficiency_pct") or 0
-            d2h_eff = mem_data.get("d2h_efficiency_pct", 0)
+            d2h_eff = mem_data.get("d2h_efficiency_pct") or 0
-            d2d_eff = mem_data.get("d2d_efficiency_pct", 0)
+            d2d_eff = mem_data.get("d2d_efficiency_pct") or 0
            # Fallback for old format
            if not d2d_peak:
-                d2d_peak = mem_data.get("peak_bandwidth_gbps", 0)
+                d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0
-                d2d_eff = mem_data.get("efficiency_pct", 0)
+                d2d_eff = mem_data.get("efficiency_pct") or 0
            lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
            lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
            lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
@ -329,8 +329,8 @@ class ReportGenerator:
        if stress and not stress.get("error"):
            lines.append("## Stress Test\n")
            passed = stress.get("passed", False)
-            duration = stress.get("duration_sec", 0)
+            duration = stress.get("duration_sec") or 0
-            elapsed = stress.get("elapsed_sec", 0)
+            elapsed = stress.get("elapsed_sec") or 0
            source = stress.get("source", "unknown")
            lines.append(f"- **Source:** {source}")
            lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
@ -432,7 +432,7 @@ class ReportGenerator:
            if mem.get("error"):
                items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
            else:
-                eff = mem.get("efficiency_pct", 0)
+                eff = mem.get("efficiency_pct") or 0
                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
                items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))