From ef2ca11c58064c7cda5a869e056c4c4108484e22 Mon Sep 17 00:00:00 2001
From: zulifeng <lifeng.zu@d-robotics.cc>
Date: Sun, 10 May 2026 15:53:12 +0800
Subject: [PATCH] fix: resolve FP8 benchmark, NCCL parsing, and report
 None-value bugs

- benchmark.py: FP8 dtype now uses torch._scaled_mm() with scale tensors
  instead of torch.matmul(), which does not support float8_e4m3fn on Hopper;
  the second operand is now allocated as (N, K) and passed transposed (b.T);
  fixes "addmm_cuda not implemented" error and enables FP8 TFLOPS measurement

- nccl_test.py: fix two bugs causing all-zero bandwidth results
  1. size sweep changed from -b 8 -e 256M (starting at 8 bytes) to
     -b 8M -e 8G for meaningful load
  2. column parser corrected: parts[2] is the dtype string, not the time value;
     now reads time=parts[5], algbw=parts[6], busbw=parts[7] per nccl-tests
     format, and requires at least 9 columns per data line (previously 7)

- report.py: replace .get(key, 0) with .get(key) or 0 for all bandwidth/stress
  fields to handle None values stored in result dicts (the default passed to
  dict.get is only used when the key is missing, not when it maps to None)

- .gitignore: exclude .claude/settings.local.json

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore           |  1 +
 modules/benchmark.py | 36 +++++++++++++++++++++++++-----------
 modules/nccl_test.py | 15 +++++++++------
 modules/report.py    | 28 ++++++++++++++--------------
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/.gitignore b/.gitignore
index 30a9cdb..934bb96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ reports/
 .venv/
 venv/
 .qoder/*
+.claude/settings.local.json
diff --git a/modules/benchmark.py b/modules/benchmark.py
index dce8b6c..ba91221 100644
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@@ -349,21 +349,35 @@ class Benchmark:
 
                     if dtype_name == "fp8":
                         a = torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
-                        b = torch.randn(K, N, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
+                        b = torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn)
+                        scale_a = torch.tensor(1.0, device="cuda")
+                        scale_b = torch.tensor(1.0, device="cuda")
+                        def _fp8_mm():
+                            return torch._scaled_mm(a, b.T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
                     else:
                         a = torch.randn(M, K, device="cuda", dtype=dtype_val)
                         b = torch.randn(K, N, device="cuda", dtype=dtype_val)
 
-                    for _ in range(warmup):
-                        torch.matmul(a, b)
-                    torch.cuda.synchronize()
-
-                    start_event = torch.cuda.Event(enable_timing=True)
-                    end_event = torch.cuda.Event(enable_timing=True)
-                    start_event.record()
-                    for _ in range(iterations):
-                        c = torch.matmul(a, b)
-                    end_event.record()
+                    if dtype_name == "fp8":
+                        for _ in range(warmup):
+                            _fp8_mm()
+                        torch.cuda.synchronize()
+                        start_event = torch.cuda.Event(enable_timing=True)
+                        end_event = torch.cuda.Event(enable_timing=True)
+                        start_event.record()
+                        for _ in range(iterations):
+                            c = _fp8_mm()
+                        end_event.record()
+                    else:
+                        for _ in range(warmup):
+                            torch.matmul(a, b)
+                        torch.cuda.synchronize()
+                        start_event = torch.cuda.Event(enable_timing=True)
+                        end_event = torch.cuda.Event(enable_timing=True)
+                        start_event.record()
+                        for _ in range(iterations):
+                            c = torch.matmul(a, b)
+                        end_event.record()
                     torch.cuda.synchronize()
 
                     elapsed_ms = start_event.elapsed_time(end_event)
diff --git a/modules/nccl_test.py b/modules/nccl_test.py
index 77ab2bd..ae10d60 100644
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@@ -150,8 +150,8 @@ class NCCLTest:
 
         cmd = [
             binary,
-            "-b", "8",
-            "-e", "256M",
+            "-b", "8M",
+            "-e", "8G",
             "-f", "2",
             "-g", str(gpu_count),
             "-w", "5",
@@ -239,12 +239,15 @@ class NCCLTest:
             if not line or line.startswith("#"):
                 continue
             parts = line.split()
-            if len(parts) >= 7:
+            # nccl-tests data lines: size count type redop root time algbw busbw #wrong [time algbw busbw #wrong]
+            if len(parts) >= 9:
                 try:
                     size = int(parts[0])
-                    algbw = float(parts[-3]) if len(parts) >= 3 else 0
-                    busbw = float(parts[-2]) if len(parts) >= 2 else 0
-                    time_us = float(parts[2]) if len(parts) >= 3 else 0
+                    # parts[2] is dtype string ('float'/'int32'/etc.), not a number
+                    # out-of-place columns: time=parts[5], algbw=parts[6], busbw=parts[7]
+                    time_us = float(parts[5])
+                    algbw = float(parts[6])
+                    busbw = float(parts[7])
                     size_results.append({
                         "size": size,
                         "time_us": time_us,
diff --git a/modules/report.py b/modules/report.py
index 9278eda..93ccbb8 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -256,20 +256,20 @@ class ReportGenerator:
             lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
             lines.append("| Metric | Value | Peak | Efficiency |")
             lines.append("|--------|-------|------|------------|")
-            d2d = mem_data.get("d2d_bandwidth_gbps", 0)
-            h2d = mem_data.get("h2d_bandwidth_gbps", 0)
-            d2h = mem_data.get("d2h_bandwidth_gbps", 0)
+            d2d = mem_data.get("d2d_bandwidth_gbps") or 0
+            h2d = mem_data.get("h2d_bandwidth_gbps") or 0
+            d2h = mem_data.get("d2h_bandwidth_gbps") or 0
             # New format with per-metric peaks
-            h2d_peak = mem_data.get("h2d_peak_gbps", 0)
-            d2h_peak = mem_data.get("d2h_peak_gbps", 0)
-            d2d_peak = mem_data.get("d2d_peak_gbps", 0)
-            h2d_eff = mem_data.get("h2d_efficiency_pct", 0)
-            d2h_eff = mem_data.get("d2h_efficiency_pct", 0)
-            d2d_eff = mem_data.get("d2d_efficiency_pct", 0)
+            h2d_peak = mem_data.get("h2d_peak_gbps") or 0
+            d2h_peak = mem_data.get("d2h_peak_gbps") or 0
+            d2d_peak = mem_data.get("d2d_peak_gbps") or 0
+            h2d_eff = mem_data.get("h2d_efficiency_pct") or 0
+            d2h_eff = mem_data.get("d2h_efficiency_pct") or 0
+            d2d_eff = mem_data.get("d2d_efficiency_pct") or 0
             # Fallback for old format
             if not d2d_peak:
-                d2d_peak = mem_data.get("peak_bandwidth_gbps", 0)
-                d2d_eff = mem_data.get("efficiency_pct", 0)
+                d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0
+                d2d_eff = mem_data.get("efficiency_pct") or 0
             lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
             lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
             lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
@@ -329,8 +329,8 @@ class ReportGenerator:
         if stress and not stress.get("error"):
             lines.append("## Stress Test\n")
             passed = stress.get("passed", False)
-            duration = stress.get("duration_sec", 0)
-            elapsed = stress.get("elapsed_sec", 0)
+            duration = stress.get("duration_sec") or 0
+            elapsed = stress.get("elapsed_sec") or 0
             source = stress.get("source", "unknown")
             lines.append(f"- **Source:** {source}")
             lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
@@ -432,7 +432,7 @@ class ReportGenerator:
             if mem.get("error"):
                 items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
             else:
-                eff = mem.get("efficiency_pct", 0)
+                eff = mem.get("efficiency_pct") or 0
                 verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
                 items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))