Add H100 acceptance PR summary

Add H100 acceptance delivery manifest
Add H100 network escalation request
2026-05-23 20:37:19 +08:00 · 2026-05-23 20:34:01 +08:00 · 2026-05-23 20:29:19 +08:00 · 2026-05-23 20:25:39 +08:00 · 2026-05-23 20:22:15 +08:00 · 2026-05-23 20:15:01 +08:00
20 changed files with 557 additions and 2517 deletions
--- a/configs/default.yaml
+++ b/configs/default.yaml
@ -1,4 +1,4 @@
-# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
+# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
 gpu_type: auto

 benchmark:
@ -14,25 +14,12 @@ benchmark:
      - fp16
      - bf16
      - fp8
-    # MAMF-style shape sweep: measure each dtype at every shape below and keep the max
-    # TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
-    # ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
-    # Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
-    # (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
-    # Empty list => single matrix_size shape (legacy behaviour).
-    sweep_sizes:
-      - 3584
-      - 4608
-      - 5376
-      - 8192
-      - 11520
-      - [2048, 2048, 13312]
-      - [2048, 2048, 16384]
-    matrix_size: 8192        # fallback shape when sweep_sizes is empty
-    warmup: 20
-    iterations: 80
-    # NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
-    # GEMM, and compiling would re-autotune per shape and make the sweep very slow.
+      - fp64
+      - int8
+    matrix_size: 8192
+    warmup: 50
+    iterations: 500
+    use_compile: true

 health:
  temp_warning: 75
@ -44,9 +31,15 @@ nccl:
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
-  test_reduce_scatter: false
-  test_allgather: false
-  test_sendrecv: false
+  test_reduce_scatter: true
+  test_allgather: true
+  test_sendrecv: true
+  message_sizes:
+    - 1M
+    - 256M
+    - 2G
+  repeats: 3
+  max_stddev_pct: 3

 multinode_nccl:
  enabled: false
@ -98,44 +91,66 @@ multinode_nccl:
    alltoall: 75

 stress:
-  duration_sec: 600           # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
+  duration_sec: 1800
+  production_duration_sec: 1800
+  use_gpu_burn: false
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all
+  dtype: bf16
+  matrix_size: 24576
+  telemetry_interval_sec: 1
+  warmup_sec: 60
+  min_steady_samples: 10
+  max_temp_c: 80
+  max_temp_delta_c: 5
+  min_power_watts: 630
+  max_tflops_jitter_pct: 5
+  require_tflops_jitter: true

 rdma:
-  min_bandwidth_gbps: 50
-  max_latency_us: 10
+  min_bandwidth_gbps: 47
+  min_port_rate_gbps: 400
+  max_latency_us: 3.5
+  max_write_latency_us: 2.0
+  max_read_latency_us: 3.5
  ib_iterations: 1000
-  msg_size: 65536
+  msg_size: 4194304
+  latency_msg_size: 8
  ib_device: null
  ib_port: 1
-  # Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
-  # node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
-  # ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
-  # then drives the local client per device.
-  cross_node:
-    enabled: false              # set true on the client node to run cross-node RDMA
-    server: null                # peer ssh address, e.g. 172.72.8.12 (server node)
-    server_addr: null           # OOB addr client connects to (default: = server)
-    ssh_user: root
-    devices: []                 # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
-    ib_port: 1
-    gid_index: null             # -x <n> for RoCE; null for pure InfiniBand
-    msg_size: 1048576           # 1 MiB — large enough to reach NDR400 peak
-    iters: 5000
-    base_oob_port: 18515        # per-device OOB port = base + device index
-    server_warmup_sec: 2.0
-    min_bandwidth_gbps: 350     # per-port PASS floor (NDR400 ≈ 0.9 × 400)
-    max_latency_us: 5
+  server_addr: null
+  ibping_target: null
+  ibping_count: 5
+  role: auto
+  pfc_ecn_counters: true
+
+nvlink:
+  expected_links_per_gpu: 18
+  expected_link_speed_gbps: 25
+  require_zero_errors: true
+
+dcgm:
+  diag_level: 3
+  timeout_sec: 3600
+  expected_num_gpus: 8
+  json_output: true
+  require_subtests: true

 training:
-  model: gpt2
+  model: synthetic_1.5b
  batch_size: 8
  seq_length: 2048
  num_steps: 50
+  warmup_steps: 5
  dtype: bf16
+  mode: ddp
+  synthetic_params_b: 1.5
+  min_tokens_per_sec: 45000
+  max_step_jitter_pct: 3
+  max_peak_memory_gb: 70
+  require_distributed: true

 report:
  output_dir: ./reports
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@ -312,31 +312,10 @@ class Benchmark:
    def run_compute_benchmark(self, dtypes: Optional[List[str]] = None) -> dict:
        comp_cfg = self.bench_cfg.get("compute", {})
        configured_dtypes = dtypes or comp_cfg.get("dtypes", ["fp32", "tf32", "fp16", "bf16", "fp8"])
-
-        # MAMF-style shape sweep (à la stas00's mamf-finder): a single fixed matmul
-        # shape under-reports the achievable peak by ~7-12% and therefore can't meet
-        # the MAMF-calibrated PASS thresholds in gpu_specs.compute_pass_thresholds_tflops.
-        # So for each dtype we time several matmul shapes and keep the MAXIMUM TFLOPS
-        # (the realistic peak). matrix_size is the fallback when sweep_sizes is empty.
-        matrix_size = comp_cfg.get("matrix_size", 8192)
-        sweep_sizes = comp_cfg.get("sweep_sizes") or [matrix_size]
-        warmup = comp_cfg.get("warmup", 20)
-        iterations = comp_cfg.get("iterations", 80)
-
-        # Each sweep entry is either an int N (square N×N×N) or an [M, N, K] triple.
-        # Non-square / K-heavy shapes (e.g. 2048×2048×13312) reach the true MAMF peak
-        # on Hopper — square-only tops out ~5% lower — so the default set mixes both.
-        def _to_shape(entry):
-            if isinstance(entry, (list, tuple)):
-                if len(entry) == 3:
-                    return tuple(int(x) for x in entry)
-                if len(entry) == 1:
-                    n = int(entry[0])
-                    return (n, n, n)
-                raise ValueError(f"sweep size {entry!r} must be an int or [M, N, K]")
-            n = int(entry)
-            return (n, n, n)
-        shapes = [_to_shape(e) for e in sweep_sizes]
+        matrix_size = comp_cfg.get("matrix_size", 4096)
+        warmup = comp_cfg.get("warmup", 10)
+        iterations = comp_cfg.get("iterations", 100)
+        use_compile = comp_cfg.get("use_compile", False)

        if not TORCH_AVAILABLE:
            self.console.print("[yellow]PyTorch not available - skipping compute benchmark[/yellow]")
@ -344,22 +323,37 @@ class Benchmark:

        gpu_count = torch.cuda.device_count()
        self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
-        if len(sweep_sizes) > 1:
-            self.console.print(
-                f"[cyan]  MAMF shape sweep over {len(sweep_sizes)} sizes: "
-                f"{', '.join(str(s) for s in sweep_sizes)}[/cyan]"
-            )
+
+        # torch.compile(max-autotune) benchmarks cuBLAS vs Triton kernels and picks
+        # the fastest for this GPU/shape, typically improving efficiency by 8-15%.
+        # compile_warmup must be larger than warmup to absorb JIT + autotuning time.
+        mm_fn = torch.matmul
+        compile_warmup = warmup
+        if use_compile:
+            try:
+                _compiled = torch.compile(torch.matmul, mode="max-autotune")
+                # Trial call to trigger JIT and verify compilation succeeds before the dtype loop.
+                _t = torch.randn(64, 64, device="cuda", dtype=torch.float32)
+                _compiled(_t, _t)
+                torch.cuda.synchronize()
+                del _t
+                mm_fn = _compiled
+                compile_warmup = max(warmup, 50)
+                self.console.print("[cyan]  torch.compile(max-autotune) enabled[/cyan]")
+            except Exception as e:
+                self.console.print(f"[yellow]  torch.compile unavailable ({type(e).__name__}), using eager[/yellow]")

        dtype_map = {
-            "fp32": (torch.float32, self.specs["fp32_tflops"]),
-            "tf32": ("tf32", self.specs["tf32_tflops"]),
-            "fp16": (torch.float16, self.specs["fp16_tflops"]),
-            "bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
-            "fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
+            "fp32": (torch.float32, self.specs.get("fp32_tflops", 0)),
+            "tf32": ("tf32", self.specs.get("tf32_tflops", 0)),
+            "fp16": (torch.float16, self.specs.get("fp16_tflops", 0)),
+            "bf16": (torch.bfloat16, self.specs.get("bf16_tflops", 0)),
+            "fp8": (getattr(torch, "float8_e4m3fn", None), self.specs.get("fp8_tflops", 0)),
+            "fp64": (torch.float64, self.specs.get("fp64_tflops", 0)),
+            "int8": (torch.int8, self.specs.get("int8_tflops", 0)),
        }

        results_by_dtype = {}
-        best_shapes = {}
        per_gpu_results = [{"index": i} for i in range(gpu_count)]

        with Progress(
@ -382,41 +376,27 @@ class Benchmark:
                    progress.advance(task)
                    continue

-                dtype_val, peak_tflops = dtype_map[dtype_name]
-
-                # allow_tf32 only affects float32 matmuls: ON for the TF32 run, OFF for
-                # the true-FP32 run so the two stay distinct.
-                old_tf32 = torch.backends.cuda.matmul.allow_tf32
-                if dtype_name == "tf32":
-                    torch.backends.cuda.matmul.allow_tf32 = True
-                    dtype_val = torch.float32
-                elif dtype_name == "fp32":
-                    torch.backends.cuda.matmul.allow_tf32 = False
-
-                best_tflops, best_shape, last_err = 0.0, None, None
-                for (M, N, K) in shapes:
+                gpu_values = []
+                errors = []
+                for gpu_idx in range(gpu_count):
                    try:
-                        t = self._bench_matmul_once(dtype_name, dtype_val, M, N, K, warmup, iterations)
-                        if t > best_tflops:
-                            best_tflops, best_shape = t, (M, N, K)
-                    except Exception as e:  # noqa: BLE001 - record and try the next shape
-                        last_err = e
-
-                torch.backends.cuda.matmul.allow_tf32 = old_tf32
-
-                if best_shape is None:
-                    results_by_dtype[dtype_name] = f"error: {last_err}"
-                    self.console.print(f"[yellow]  {dtype_name}: {last_err}[/yellow]")
-                else:
-                    shape_str = "x".join(str(d) for d in best_shape)
-                    results_by_dtype[dtype_name] = round(best_tflops, 1)
-                    best_shapes[dtype_name] = shape_str
-                    for pg in per_gpu_results:
-                        pg[dtype_name] = round(best_tflops, 1)
-                    if len(shapes) > 1:
-                        self.console.print(
-                            f"[dim]  {dtype_name}: {best_tflops:.1f} TFLOPS @ {shape_str}[/dim]"
+                        val = self._benchmark_dtype_on_gpu(
+                            dtype_name, dtype_map[dtype_name][0], matrix_size,
+                            warmup, compile_warmup, iterations, mm_fn, gpu_idx,
                        )
+                        gpu_values.append(val)
+                        per_gpu_results[gpu_idx][dtype_name] = round(val, 1)
+                    except Exception as e:
+                        errors.append(f"gpu{gpu_idx}: {e}")
+                        per_gpu_results[gpu_idx][dtype_name] = f"error: {e}"
+                    finally:
+                        torch.cuda.empty_cache()
+
+                if gpu_values:
+                    results_by_dtype[dtype_name] = round(sum(gpu_values) / len(gpu_values), 1)
+                else:
+                    results_by_dtype[dtype_name] = "error: " + "; ".join(errors[:3])
+                    self.console.print(f"[yellow]  {dtype_name}: {results_by_dtype[dtype_name]}[/yellow]")

                progress.advance(task)

@ -427,78 +407,119 @@ class Benchmark:
                if peak_tp:
                    efficiency[dt] = round((achieved / peak_tp) * 100, 1)

+        consistency = {}
+        for dt in results_by_dtype:
+            vals = [pg.get(dt) for pg in per_gpu_results]
+            nums = [v for v in vals if isinstance(v, (int, float))]
+            if len(nums) >= 2:
+                mean = sum(nums) / len(nums)
+                spread_pct = ((max(nums) - min(nums)) / mean * 100) if mean else 0
+                consistency[dt] = {
+                    "mean_tflops": round(mean, 1),
+                    "min_tflops": round(min(nums), 1),
+                    "max_tflops": round(max(nums), 1),
+                    "spread_pct": round(spread_pct, 2),
+                    "max_allowed_pct": 3,
+                    "passed": spread_pct <= 3,
+                }
+
+        pass_thresholds = dict(self.specs.get("compute_pass_thresholds_tflops") or {})
+        threshold_passed = True
+        for dt, threshold in pass_thresholds.items():
+            val = results_by_dtype.get(dt)
+            if not isinstance(val, (int, float)) or val < threshold:
+                threshold_passed = False
+                break
+        consistency_passed = all(row.get("passed", False) for row in consistency.values()) if consistency else True
+
        return {
            "compute": {
+                "passed": threshold_passed and consistency_passed,
                "per_dtype_tflops": results_by_dtype,
                "peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
                "efficiency_pct": efficiency,
                # Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
                # report.py judges PASS/WARN/FAIL against these directly instead of
                # using % of peak. Empty dict => fall back to legacy 80% rule.
-                "pass_thresholds_tflops": dict(
-                    self.specs.get("compute_pass_thresholds_tflops") or {}
-                ),
+                "pass_thresholds_tflops": pass_thresholds,
                "per_gpu": per_gpu_results,
-                "sweep_sizes": list(sweep_sizes),
-                "best_shapes": best_shapes,
+                "consistency": consistency,
                "matrix_size": matrix_size,
                "warmup": warmup,
                "iterations": iterations,
            }
        }

-    def _bench_matmul_once(self, dtype_name: str, dtype_val, M: int, N: int, K: int,
-                           warmup: int, iterations: int) -> float:
-        """Time one (M×K)·(K×N) matmul for a dtype and return achieved TFLOPS.
-
-        Uses an L2-cache-busting pool of matrix pairs (total > 256 MB) so operands
-        can't be served from L2 across iterations, and CUDA events for timing. FP8
-        goes through torch._scaled_mm (e4m3); all others through torch.matmul — eager
-        cuBLAS, which on H100 beats torch.compile/Triton for plain GEMM and avoids the
-        per-shape recompile cost that would make a sweep pathologically slow.
-        """
-        elem_bytes = 1 if dtype_name == "fp8" else torch.tensor([], dtype=dtype_val).element_size()
-        pair_bytes = (M * K + K * N) * elem_bytes
-        num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes))  # ceil(256MB / pair)
-
-        if dtype_name == "fp8":
-            if not hasattr(torch, "_scaled_mm"):
-                raise RuntimeError("torch._scaled_mm unavailable — upgrade to PyTorch >= 2.1")
-            pools_a = [torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
-            pools_b = [torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
-            scale_a = torch.tensor(1.0, device="cuda")
-            scale_b = torch.tensor(1.0, device="cuda")
-            def op(i):
-                return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
-        else:
-            pools_a = [torch.randn(M, K, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
-            pools_b = [torch.randn(K, N, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
-            def op(i):
-                return torch.matmul(pools_a[i], pools_b[i])
-
+    def _benchmark_dtype_on_gpu(self, dtype_name: str, dtype_val, matrix_size: int,
+                                warmup: int, compile_warmup: int, iterations: int,
+                                mm_fn, gpu_idx: int) -> float:
+        if dtype_name == "fp8" and dtype_val is None:
+            raise RuntimeError("torch.float8_e4m3fn unavailable")
+        device = f"cuda:{gpu_idx}"
+        old_tf32 = torch.backends.cuda.matmul.allow_tf32
        try:
-            # Probe once so a broken/unsupported kernel raises before the timed loop.
-            _probe = op(0)
-            torch.cuda.synchronize()
-            del _probe
+            with torch.cuda.device(gpu_idx):
+                if dtype_name == "tf32":
+                    torch.backends.cuda.matmul.allow_tf32 = True
+                    dtype_val = torch.float32

-            for i in range(warmup):
-                op(i % num_pools)
-            torch.cuda.synchronize()
+                M = N = K = matrix_size
+                if dtype_name == "int8" and M > 4096:
+                    # torch._int_mm on 8192 can be extremely memory hungry because the
+                    # output is int32. Keep it production-visible, but bounded.
+                    M = N = K = 4096

-            start_event = torch.cuda.Event(enable_timing=True)
-            end_event = torch.cuda.Event(enable_timing=True)
-            start_event.record()
-            for i in range(iterations):
-                op(i % num_pools)
-            end_event.record()
-            torch.cuda.synchronize()
-            elapsed_ms = start_event.elapsed_time(end_event)
+                elem_bytes = 1 if dtype_name in ("fp8", "int8") else torch.tensor([], dtype=dtype_val).element_size()
+                pair_bytes = 2 * M * K * elem_bytes
+                num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes))
+
+                if dtype_name == "fp8":
+                    if not hasattr(torch, "_scaled_mm"):
+                        raise RuntimeError("torch._scaled_mm unavailable")
+                    pools_a = [torch.randn(M, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
+                    pools_b = [torch.randn(N, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
+                    scale_a = torch.tensor(1.0, device=device)
+                    scale_b = torch.tensor(1.0, device=device)
+
+                    def run(i):
+                        return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
+
+                    effective_warmup = warmup
+                elif dtype_name == "int8":
+                    if not hasattr(torch, "_int_mm"):
+                        raise RuntimeError("torch._int_mm unavailable")
+                    pools_a = [torch.randint(-128, 127, (M, K), device=device, dtype=torch.int8) for _ in range(num_pools)]
+                    pools_b = [torch.randint(-128, 127, (K, N), device=device, dtype=torch.int8) for _ in range(num_pools)]
+
+                    def run(i):
+                        return torch._int_mm(pools_a[i], pools_b[i])
+
+                    effective_warmup = warmup
+                else:
+                    pools_a = [torch.randn(M, K, device=device, dtype=dtype_val) for _ in range(num_pools)]
+                    pools_b = [torch.randn(K, N, device=device, dtype=dtype_val) for _ in range(num_pools)]
+
+                    def run(i):
+                        return mm_fn(pools_a[i], pools_b[i])
+
+                    effective_warmup = compile_warmup
+
+                for i in range(effective_warmup):
+                    run(i % num_pools)
+                torch.cuda.synchronize()
+                start_event = torch.cuda.Event(enable_timing=True)
+                end_event = torch.cuda.Event(enable_timing=True)
+                start_event.record()
+                for i in range(iterations):
+                    c = run(i % num_pools)
+                end_event.record()
+                torch.cuda.synchronize()
+                elapsed_ms = start_event.elapsed_time(end_event)
+                del pools_a, pools_b, c
+                flops = 2 * M * N * K * iterations
+                return flops / (elapsed_ms / 1000) / 1e12
        finally:
-            del pools_a, pools_b
-            torch.cuda.empty_cache()
-
-        return (2 * M * N * K * iterations) / (elapsed_ms / 1000) / 1e12
+            torch.backends.cuda.matmul.allow_tf32 = old_tf32

    @staticmethod
    def print_results(results: dict, console: Console = None):
@ -583,77 +604,24 @@ class Benchmark:
                              f"[{ec}]{ef:.1f}%[/{ec}]")
            c.print(table)

-    @staticmethod
-    def judge_compute(results: dict) -> dict:
-        """Judge compute results against pass_thresholds_tflops.
-
-        Single source of truth for the PASS/WARN/FAIL rule (same one report.py uses):
-        achieved >= thr -> PASS; >= 0.9*thr -> WARN; else FAIL. A string achieved value
-        (skipped/error) -> SKIP. A dtype without a threshold falls back to efficiency
-        (>=80 PASS / >=50 WARN / else FAIL).
-
-        Returns {"rows": [(dtype, achieved, threshold, status), ...], "verdict": str}.
-        """
-        comp = results.get("compute", results)
-        per_dtype = comp.get("per_dtype_tflops", {})
-        thresholds = comp.get("pass_thresholds_tflops", {}) or {}
-        eff = comp.get("efficiency_pct", {})
-        rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
-        rows, verdict = [], "PASS"
-        for dt, val in per_dtype.items():
-            thr = thresholds.get(dt)
-            if isinstance(val, str):
-                status = "SKIP"
-            elif thr:
-                status = "PASS" if val >= thr else ("WARN" if val >= thr * 0.9 else "FAIL")
-            else:
-                e = eff.get(dt, 0)
-                status = "PASS" if e >= 80 else ("WARN" if e >= 50 else "FAIL")
-            rows.append((dt, val, thr, status))
-            if rank[status] > rank[verdict]:
-                verdict = status
-        return {"rows": rows, "verdict": verdict}
-
-    @staticmethod
-    def print_compute_verdict(results: dict, console: Console = None) -> str:
-        """Print the PASS/WARN/FAIL table for compute results; return the verdict."""
-        c = console or Console()
-        judged = Benchmark.judge_compute(results)
-        color = {"PASS": "green", "WARN": "yellow", "FAIL": "red", "SKIP": "dim"}
-        c.print("\n[bold cyan]Compute Verdict (vs thresholds)[/bold cyan]")
-        for dt, val, thr, status in judged["rows"]:
-            val_s = f"{val:.1f}" if isinstance(val, (int, float)) else str(val)
-            thr_s = f">= {thr}" if thr else "(efficiency)"
-            c.print(f"  {dt.upper():>4}: {val_s:>8}  {thr_s:<12} [{color[status]}]{status}[/{color[status]}]")
-        v = judged["verdict"]
-        c.print(f"  [bold]VERDICT: [{color[v]}]{v}[/{color[v]}][/bold]")
-        return v
-
-
-def _run_cli() -> None:
-    """`python -m modules.benchmark` — run ONLY the compute-throughput benchmark."""
-    import argparse
-    from pathlib import Path
-
-    import yaml
-
-    repo_root = Path(__file__).resolve().parent.parent
-    parser = argparse.ArgumentParser(description="Run the compute-throughput benchmark only.")
-    parser.add_argument("--config", default=str(repo_root / "configs" / "default.yaml"),
-                        help="path to config YAML (default: configs/default.yaml)")
-    parser.add_argument("--json", action="store_true", help="also print raw JSON of the compute results")
-    args = parser.parse_args()
-
-    with open(args.config) as f:
-        config = yaml.safe_load(f) or {}
-
-    results = Benchmark(config).run_compute_benchmark()
-    Benchmark.print_results(results)
-    Benchmark.print_compute_verdict(results)
-
-    if args.json:
-        print("JSON_RESULT:" + json.dumps(results["compute"]))
-
-
-if __name__ == "__main__":
-    _run_cli()
+            consistency = comp.get("consistency", {})
+            if consistency:
+                t_cons = Table(title="Per-GPU Consistency", box=None, padding=(0, 1))
+                t_cons.add_column("DType", style="bold")
+                t_cons.add_column("Min", justify="right")
+                t_cons.add_column("Mean", justify="right")
+                t_cons.add_column("Max", justify="right")
+                t_cons.add_column("Spread", justify="right")
+                t_cons.add_column("Status", justify="right")
+                for dt, row in consistency.items():
+                    status = "PASS" if row.get("passed") else "FAIL"
+                    color = "green" if row.get("passed") else "red"
+                    t_cons.add_row(
+                        dt.upper(),
+                        f"{row.get('min_tflops', 0):.1f}",
+                        f"{row.get('mean_tflops', 0):.1f}",
+                        f"{row.get('max_tflops', 0):.1f}",
+                        f"{row.get('spread_pct', 0):.2f}%",
+                        f"[{color}]{status}[/{color}]",
+                    )
+                c.print(t_cons)
--- a/modules/gpu_specs.py
+++ b/modules/gpu_specs.py
@ -11,7 +11,6 @@ GPU_NAME_PATTERNS = {
    "A100": "a100",
    "A800": "a800",
    "H100": "h100",
-    "H800": "h800",  # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
    "H200": "h200",
    "H20":  "h20",   # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
    "B200": "b200",
@ -36,17 +35,11 @@ GPU_SPECS = {
        "fp16_tflops": 990,                 # dense (1979 sparse w/ 2:4)
        "bf16_tflops": 990,                 # dense
        "fp8_tflops": 1979,                 # dense
+        "fp64_tflops": 67,
+        "int8_tflops": 1979,
        "compute_pass_thresholds_tflops": {
-            # Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
-            # threshold ~2-4% below the sustained value measured across 16 GPUs via the
-            # MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
-            # fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
-            # fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
-            # on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
-            # an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
-            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
-            # FP64 63 / INT8 1536 — listed for documentation; benchmark module
-            # doesn't currently exercise these dtypes.
+            "fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
+            "fp64": 63, "int8": 1536,
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
@ -67,51 +60,10 @@ GPU_SPECS = {
        "fp16_tflops": 990,                 # dense
        "bf16_tflops": 990,                 # dense
        "fp8_tflops": 1979,                 # dense
-        # PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
-        # calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
-        # achievable TFLOPS in PyTorch is in the same band.
-        "compute_pass_thresholds_tflops": {
-            "fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
-        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,
        "pcie_gen": 5,
-        "min_driver_version": "545",
-        "min_cuda_version": "12.4",
-    },
-    "h800": {
-        # H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
-        # clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
-        # FP32) are identical to H100. Two restrictions vs H100:
-        #   1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
-        #   2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
-        # All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
-        # NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
-        "full_name": "NVIDIA H800 SXM5",
-        "architecture": "Hopper",
-        "compute_capability": 9.0,
-        "hbm_capacity_gb": 80,
-        "hbm_type": "HBM3",
-        "memory_bandwidth_gbps": 3350,      # GB/s (3.35 TB/s) — same as H100 SXM
-        "fp32_tflops": 67,
-        "tf32_tflops": 495,                 # dense (same as H100)
-        "fp16_tflops": 990,                 # dense (same as H100)
-        "bf16_tflops": 990,                 # dense (same as H100)
-        "fp8_tflops": 1979,                 # dense (same as H100)
-        # Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
-        # eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
-        # tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
-        # 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
-        # can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
-        # h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
-        "compute_pass_thresholds_tflops": {
-            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
-        },
-        "tdp_watts": 700,
-        "nvlink_gen": 4,
-        "nvlink_bandwidth_gbps": 400,       # bidirectional — HALF of H100 (export restriction)
-        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
--- a/modules/rdma_test.py
+++ b/modules/rdma_test.py
@ -1,15 +1,17 @@
 """RDMA / InfiniBand bandwidth and latency test module."""

+import glob
 import os
 import shutil
 import subprocess
-import time
 from datetime import datetime
 from typing import Optional, List

 from rich.console import Console
 from rich.table import Table

+from modules.gpu_specs import resolve_tools_dir
+

 class RDMATest:

@ -17,11 +19,24 @@ class RDMATest:
        self.config = config
        self.console = Console()
        self.rdma_cfg = config.get("rdma", {})
+        self.tools_dir = resolve_tools_dir(config)

    def _find_tool(self, name: str) -> Optional[str]:
        p = shutil.which(name)
        if p:
            return p
+        candidates = [
+            os.path.join(self.tools_dir, "perftest", name),
+            os.path.join(self.tools_dir, "perftest", "bin", name),
+            os.path.join(self.tools_dir, "rdma", name),
+            os.path.join(self.tools_dir, name),
+        ]
+        for path in candidates:
+            if os.path.isfile(path) and os.access(path, os.X_OK):
+                return path
+        for path in glob.glob(os.path.join(self.tools_dir, "**", name), recursive=True):
+            if os.path.isfile(path) and os.access(path, os.X_OK):
+                return path
        return None

    def _get_ib_devices(self) -> List[str]:
@ -101,26 +116,40 @@ class RDMATest:

        self.console.print(f"[cyan]RDMA Test - Devices: {', '.join(devices)}[/cyan]")

-        bw_results = self._run_bandwidth_tests(devices)
-        latency_results = self._run_latency_tests(devices)
+        active_pairs = [
+            (dev, port) for dev, port in ib_devices
+            if "ACTIVE" in self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state").upper()
+        ]
+        port_checks = self._evaluate_port_checks(device_info)
+        test_devices = [dev for dev, _ in active_pairs]
+        bw_results = self._run_bandwidth_tests(test_devices)
+        latency_results = self._run_latency_tests(test_devices)
+        ibping_results = self._run_ibping_tests(active_pairs)
+        fabric_counters = self._collect_pfc_ecn_counters() if self.rdma_cfg.get("pfc_ecn_counters", True) else {}
+        failures = self._failure_reasons(port_checks, bw_results, latency_results, ibping_results, fabric_counters)

+        fabric_counters_missing = (
+            self.rdma_cfg.get("pfc_ecn_counters", True)
+            and fabric_counters
+            and not fabric_counters.get("counters")
+        )
        all_passed = all(
            r.get("status") == "PASS"
-            for r in bw_results + latency_results
+            for r in bw_results + latency_results + ibping_results
            if isinstance(r, dict)
-        )
+        ) and all(p.get("status") == "PASS" for p in port_checks) and not fabric_counters.get("failed", False) and not fabric_counters_missing

-        result = {
+        return {
            "passed": all_passed,
            "devices": device_info,
+            "port_checks": port_checks,
            "bandwidth_tests": bw_results,
            "latency_tests": latency_results,
+            "ibping_tests": ibping_results,
+            "fabric_counters": fabric_counters,
+            "failures": failures,
            "timestamp": datetime.now().isoformat(),
        }
-        # Cross-node (two-host) RDMA, run only when a peer is configured.
-        if (self.rdma_cfg.get("cross_node", {}) or {}).get("enabled"):
-            result["cross_node"] = self.run_cross_node()
-        return result

    def _collect_device_info(self, devices: List[str]) -> List[dict]:
        info = []
@ -141,11 +170,83 @@ class RDMATest:
                            port_info[label] = f.read().strip()
                    except (FileNotFoundError, PermissionError):
                        port_info[label] = "N/A"
+                port_info["link_layer"] = self._read_sys(
+                    f"/sys/class/infiniband/{dev}/ports/{port}/link_layer"
+                ) or "N/A"

                dev_info["ports"].append(port_info)
            info.append(dev_info)
        return info

+    def _evaluate_port_checks(self, device_info: List[dict]) -> List[dict]:
+        checks = []
+        min_rate = float(self.rdma_cfg.get("min_port_rate_gbps", 400))
+        for dev in device_info:
+            for port in dev.get("ports", []):
+                if port.get("link_layer") != "InfiniBand":
+                    continue
+                state = port.get("state", "")
+                rate = port.get("rate", "")
+                rate_gbps = self._parse_rate_gbps(rate)
+                status = "PASS" if "ACTIVE" in state.upper() and rate_gbps >= min_rate else "FAIL"
+                checks.append({
+                    "device": dev.get("name"),
+                    "port": port.get("port"),
+                    "state": state,
+                    "rate": rate,
+                    "rate_gbps": rate_gbps,
+                    "min_rate_gbps": min_rate,
+                    "status": status,
+                })
+        return checks
+
+    @staticmethod
+    def _parse_rate_gbps(rate: str) -> float:
+        # Example: "400 Gb/sec (4X NDR)"
+        try:
+            return float(str(rate).split()[0])
+        except (ValueError, IndexError, AttributeError):
+            return 0.0
+
+    @staticmethod
+    def _failure_reasons(port_checks: List[dict], bw_results: List[dict],
+                         latency_results: List[dict], ibping_results: List[dict],
+                         fabric_counters: dict) -> List[str]:
+        failures = []
+        for p in port_checks:
+            if p.get("status") != "PASS":
+                failures.append(
+                    f"{p.get('device')} port {p.get('port')} state/rate failed "
+                    f"({p.get('state')}, {p.get('rate')}; required >= {p.get('min_rate_gbps')}Gbps ACTIVE)"
+                )
+        for r in bw_results:
+            if r.get("status") != "PASS":
+                if r.get("error"):
+                    failures.append(f"{r.get('test')} failed: {r.get('error')}")
+                else:
+                    failures.append(
+                        f"{r.get('test')} bandwidth {r.get('bandwidth_gbps', 0)}GB/s "
+                        f"< {r.get('min_required_gbps', 'N/A')}GB/s"
+                    )
+        for r in latency_results:
+            if r.get("status") != "PASS":
+                if r.get("error"):
+                    failures.append(f"{r.get('test')} failed: {r.get('error')}")
+                else:
+                    failures.append(
+                        f"{r.get('test')} latency {r.get('latency_us', 0)}us "
+                        f"> {r.get('max_allowed_us', 'N/A')}us"
+                    )
+        for r in ibping_results:
+            if r.get("status") != "PASS":
+                failures.append(f"{r.get('test')} failed: {r.get('error') or r.get('output_tail', '')[:120]}")
+        if fabric_counters.get("failed"):
+            nonzero = [f"{k}={v}" for k, v in fabric_counters.get("counters", {}).items() if v]
+            failures.append("non-zero PFC/ECN/CNP/congestion counters: " + ", ".join(nonzero[:10]))
+        elif fabric_counters and not fabric_counters.get("counters"):
+            failures.append("PFC/ECN/CNP/congestion counters not found; fabric counter evidence missing")
+        return failures
+
    def _run_ib_command(self, cmd: List[str], timeout: int = 60) -> dict:
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
@ -168,45 +269,70 @@ class RDMATest:
        iters = self.rdma_cfg.get("ib_iterations", 1000)
        dx = self.rdma_cfg.get("ib_device", None)
        port = self.rdma_cfg.get("ib_port", 1)
+        server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
+        role = self.rdma_cfg.get("role", "auto")

        for tool, label in [(ib_write_bw, "ib_write_bw"), (ib_read_bw, "ib_read_bw")]:
            if not tool:
-                results.append({"test": label, "status": "SKIP", "error": "not installed"})
+                results.append({"test": label, "status": "FAIL", "error": "not installed"})
+                continue
+            if role == "client" and not server_addr:
+                results.append({
+                    "test": label,
+                    "status": "FAIL",
+                    "error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
+                    "role": "client",
+                })
                continue

-            server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size)]
-            client_cmd = server_cmd + ["localhost"]
+            server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
+            client_cmd = server_cmd + [server_addr or "localhost"]

-            server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-            import time
-            time.sleep(1)
+            if role == "server":
+                results.append(self._run_server_mode(label, server_cmd))
+                continue
+
+            server = None
+            if not server_addr and role != "client":
+                server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+                import time
+                time.sleep(1)

            try:
                client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
-                server.wait(timeout=10)
+                if server:
+                    server.wait(timeout=10)

-                output = client.stdout + server.stdout.read() if server.stdout else ""
-                bw_mbps = 0
+                output = client.stdout
+                if server and server.stdout:
+                    output += server.stdout.read()
+                bw_mibps = 0
                for line in output.split("\n"):
                    line = line.strip()
                    if not line:
                        continue
                    parts = line.split()
                    try:
-                        bw_mbps = max(bw_mbps, float(parts[-1]))
+                        if len(parts) >= 5 and int(parts[0]) == int(msg_size):
+                            # perftest bandwidth rows:
+                            # #bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
+                            bw_mibps = max(bw_mibps, float(parts[3]))
                    except (ValueError, IndexError):
                        continue

-                bw_gbps = bw_mbps / 1000 if bw_mbps else 0
-                status = "PASS" if bw_gbps >= min_bw else "WARN"
+                bw_gbps = bw_mibps * 1024 * 1024 / 1e9 if bw_mibps else 0
+                status = "PASS" if bw_gbps >= min_bw else "FAIL"
                results.append({
                    "test": label,
                    "status": status,
                    "bandwidth_gbps": round(bw_gbps, 2),
                    "min_required_gbps": min_bw,
+                    "msg_size": msg_size,
+                    "role": "client" if server_addr else "local_loopback",
                })
            except Exception as e:
-                server.kill()
+                if server:
+                    server.kill()
                results.append({"test": label, "status": "FAIL", "error": str(e)})

        return results
@ -216,240 +342,214 @@ class RDMATest:
        ib_write_lat = self._find_tool("ib_write_lat")
        ib_read_lat = self._find_tool("ib_read_lat")
        max_lat_us = self.rdma_cfg.get("max_latency_us", 10)
+        max_by_test = {
+            "ib_write_lat": self.rdma_cfg.get("max_write_latency_us", max_lat_us),
+            "ib_read_lat": self.rdma_cfg.get("max_read_latency_us", max_lat_us),
+        }
        dx = self.rdma_cfg.get("ib_device", None)
        port = self.rdma_cfg.get("ib_port", 1)
+        msg_size = self.rdma_cfg.get("latency_msg_size", 8)
+        iters = self.rdma_cfg.get("ib_iterations", 1000)
+        server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
+        role = self.rdma_cfg.get("role", "auto")

        for tool, label in [(ib_write_lat, "ib_write_lat"), (ib_read_lat, "ib_read_lat")]:
            if not tool:
-                results.append({"test": label, "status": "SKIP", "error": "not installed"})
+                results.append({"test": label, "status": "FAIL", "error": "not installed"})
+                continue
+            if role == "client" and not server_addr:
+                results.append({
+                    "test": label,
+                    "status": "FAIL",
+                    "error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
+                    "role": "client",
+                })
                continue

-            server_cmd = [tool, "-d", dx or devices[0], "-i", str(port)]
-            client_cmd = server_cmd + ["localhost"]
+            server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
+            client_cmd = server_cmd + [server_addr or "localhost"]

-            server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-            import time
-            time.sleep(1)
+            if role == "server":
+                results.append(self._run_server_mode(label, server_cmd))
+                continue
+
+            server = None
+            if not server_addr and role != "client":
+                server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+                import time
+                time.sleep(1)

            try:
                client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
-                server.wait(timeout=10)
+                if server:
+                    server.wait(timeout=10)

-                output = client.stdout + server.stdout.read() if server.stdout else ""
+                output = client.stdout
+                if server and server.stdout:
+                    output += server.stdout.read()
                lat_us = 0
                for line in output.split("\n"):
                    parts = line.strip().split()
                    try:
-                        lat_us = max(lat_us, float(parts[-1]))
+                        if len(parts) >= 6:
+                            int(parts[0])
+                            int(parts[1])
+                            # perftest latency rows:
+                            # #bytes #iterations t_min t_max t_typical t_avg t_stdev p99 p99.9
+                            lat_us = max(lat_us, float(parts[5]))
                    except (ValueError, IndexError):
                        continue

-                status = "PASS" if 0 < lat_us <= max_lat_us else ("WARN" if lat_us > 0 else "FAIL")
+                max_allowed = max_by_test[label]
+                status = "PASS" if 0 < lat_us <= max_allowed else "FAIL"
                results.append({
                    "test": label,
                    "status": status,
                    "latency_us": round(lat_us, 2),
-                    "max_allowed_us": max_lat_us,
+                    "max_allowed_us": max_allowed,
+                    "msg_size": msg_size,
+                    "role": "client" if server_addr else "local_loopback",
                })
            except Exception as e:
-                server.kill()
+                if server:
+                    server.kill()
                results.append({"test": label, "status": "FAIL", "error": str(e)})

        return results

-    # ------------------------------------------------------------------
-    # Cross-node (two-host) RDMA over perftest, orchestrated via SSH.
-    # Runs FROM the client host: for each IB device it launches the matching
-    # perftest server on the peer over SSH (held open in a live ssh channel),
-    # then runs the local client against the peer's OOB address and parses the
-    # result. Replaces the old standalone scripts/rdma_cross_node.sh.
-    # ------------------------------------------------------------------
-
-    def _active_ib_devices(self) -> List[str]:
-        """IB devices whose port 1 is InfiniBand link_layer and ACTIVE."""
-        out = []
-        for dev in self._get_ib_devices():
-            for port in self._get_ib_ports(dev):
-                ll = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/link_layer")
-                st = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state")
-                if ll == "InfiniBand" and "ACTIVE" in st.upper():
-                    out.append(dev)
-                    break
-        return out
-
-    def run_cross_node(self) -> dict:
-        cn = self.rdma_cfg.get("cross_node", {}) or {}
-        if not cn.get("enabled"):
-            return {"status": "SKIP", "skipped": True,
-                    "reason": "rdma.cross_node.enabled is false"}
-
-        server = cn.get("server")
-        if not server:
-            return {"status": "SKIP", "skipped": True,
-                    "reason": "rdma.cross_node.server (peer ssh address) not set"}
-
-        ssh_user = cn.get("ssh_user", "root")
-        server_target = server if "@" in server else f"{ssh_user}@{server}"
-        # OOB address the client's perftest connects to (defaults to the ssh host).
-        server_addr = cn.get("server_addr") or server.split("@")[-1]
-        ib_port = cn.get("ib_port", 1)
-        gid_index = cn.get("gid_index")
-        msg_size = cn.get("msg_size", 1048576)
-        iters = cn.get("iters", 5000)
-        base_port = cn.get("base_oob_port", 18515)
-        warmup = cn.get("server_warmup_sec", 2.0)
-        min_bw = cn.get("min_bandwidth_gbps", 350)
-        max_lat = cn.get("max_latency_us", 5)
-
-        devices = cn.get("devices") or self._active_ib_devices()
-        if not devices:
-            return {"status": "SKIP", "skipped": True,
-                    "reason": "no active InfiniBand devices to test"}
-
-        has_bw = self._find_tool("ib_write_bw") is not None
-        has_lat = self._find_tool("ib_write_lat") is not None
-        if not has_bw and not has_lat:
-            return {"status": "SKIP", "skipped": True,
-                    "reason": "perftest (ib_write_bw / ib_write_lat) not installed"}
-
-        self.console.print(
-            f"[cyan]Cross-node RDMA — client → {server_addr}, "
-            f"devices: {', '.join(devices)}[/cyan]")
-
-        per_device = []
-        for idx, dev in enumerate(devices):
-            oob = base_port + idx
-            entry = {"device": dev}
-
-            if has_bw:
-                bw = self._cross_node_perftest(
-                    "ib_write_bw", dev, server_target, server_addr, ib_port,
-                    oob, gid_index, warmup,
-                    extra=["--report_gbits", "-s", str(msg_size), "-n", str(iters)],
-                    parse="bw")
-                entry["bandwidth_gbps"] = bw
-                if isinstance(bw, (int, float)):
-                    entry["bw_status"] = "PASS" if bw >= min_bw else "WARN"
-                else:
-                    entry["bw_status"] = "FAIL"
-
-            if has_lat:
-                lat = self._cross_node_perftest(
-                    "ib_write_lat", dev, server_target, server_addr, ib_port,
-                    oob, gid_index, warmup, extra=[], parse="lat")
-                if isinstance(lat, dict):
-                    entry["latency_us"] = lat.get("typical")
-                    entry["latency_p99_us"] = lat.get("p99")
-                    t = lat.get("typical")
-                    entry["lat_status"] = ("PASS" if isinstance(t, (int, float)) and 0 < t <= max_lat
-                                           else ("WARN" if isinstance(t, (int, float)) else "FAIL"))
-                else:
-                    entry["latency_us"] = lat
-                    entry["lat_status"] = "FAIL"
-
-            per_device.append(entry)
-
-        statuses = [e.get(k) for e in per_device for k in ("bw_status", "lat_status") if e.get(k)]
-        verdict = "PASS"
-        for s in statuses:
-            if s == "FAIL":
-                verdict = "FAIL"
-                break
-            if s == "WARN" and verdict == "PASS":
-                verdict = "WARN"
-
-        return {
-            "status": verdict,
-            "server": server_addr,
-            "min_bandwidth_gbps": min_bw,
-            "max_latency_us": max_lat,
-            "per_device": per_device,
-            "timestamp": datetime.now().isoformat(),
-        }
-
-    def _cross_node_perftest(self, tool: str, dev: str, server_target: str,
-                             server_addr: str, ib_port: int, oob_port: int,
-                             gid_index, warmup: float, extra: List[str], parse: str):
-        """Start `tool` server on the peer via SSH, run the local client, parse output.
-
-        Returns a float (bw, Gb/s), a dict {typical, p99} (lat, µs), or an error string.
-        """
-        tool_path = self._find_tool(tool)
-        if not tool_path:
-            return f"{tool} not installed"
-
-        flags = ["-d", dev, "-i", str(ib_port), "-p", str(oob_port), "-F"]
-        if gid_index is not None:
-            flags += ["-x", str(gid_index)]
-        flags += extra
-
-        server_cmd = " ".join([tool] + flags)  # server: no host argument
-        server_proc = None
+    def _run_server_mode(self, label: str, server_cmd: List[str]) -> dict:
+        timeout = int(self.rdma_cfg.get("server_timeout_sec", 120))
        try:
-            server_proc = subprocess.Popen(
-                ["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
-                 server_target, server_cmd],
-                stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-            time.sleep(warmup)  # let the remote server bind before the client connects
-
-            client = subprocess.run([tool_path] + flags + [server_addr],
-                                    capture_output=True, text=True, timeout=120)
-            out = client.stdout + "\n" + (client.stderr or "")
-            return self._parse_perftest_lat(out) if parse == "lat" else self._parse_perftest_bw(out)
+            r = subprocess.run(server_cmd, capture_output=True, text=True, timeout=timeout)
+            return {
+                "test": label,
+                "status": "PASS" if r.returncode == 0 else "FAIL",
+                "role": "server",
+                "server_timeout_sec": timeout,
+                "output_tail": (r.stdout + r.stderr)[-500:],
+            }
        except subprocess.TimeoutExpired:
-            return "timeout"
-        except Exception as e:  # noqa: BLE001
-            return f"error: {e}"
-        finally:
-            if server_proc and server_proc.poll() is None:
-                server_proc.terminate()
+            return {
+                "test": label,
+                "status": "PASS",
+                "role": "server",
+                "server_timeout_sec": timeout,
+                "note": "server ran until timeout waiting for client",
+            }
+        except Exception as e:
+            return {"test": label, "status": "FAIL", "role": "server", "error": str(e)}
+
+    def _run_ibping_tests(self, active_pairs: List[tuple[str, str]]) -> List[dict]:
+        tool = self._find_tool("ibping")
+        if not tool:
+            return [{"test": "ibping", "status": "FAIL", "error": "not installed"}]
+        if not active_pairs:
+            return [{"test": "ibping", "status": "FAIL", "error": "no active IB ports"}]
+
+        dev, port = active_pairs[0]
+        target = (
+            self.rdma_cfg.get("ibping_target")
+            or os.environ.get("IBPING_TARGET")
+        )
+        count = int(self.rdma_cfg.get("ibping_count", 5))
+        role = self.rdma_cfg.get("role", "auto")
+        server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
+        base = [tool, "-C", dev, "-P", str(port)]
+
+        if role == "server":
+            return [self._run_server_mode("ibping", [*base, "-S"])]
+
+        server = None
+        if not target and role != "client":
+            target = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/lid")
+            server = subprocess.Popen([*base, "-S"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            import time
+            time.sleep(1)
+
+        if not target:
+            reason = "no ibping target/lid"
+            if role == "client" or server_addr:
+                reason = (
+                    "cross-node ibping requires rdma.ibping_target or IBPING_TARGET "
+                    "(peer LID/GID; rdma.server_addr is only for perftest TCP bootstrap)"
+                )
+            return [{"test": "ibping", "status": "FAIL", "error": reason}]
+
+        try:
+            r = subprocess.run([*base, "-c", str(count), str(target)], capture_output=True, text=True, timeout=30)
+            if server:
+                server.terminate()
                try:
-                    server_proc.wait(timeout=5)
+                    server.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    server.kill()
+            output = r.stdout + r.stderr
+            failed = r.returncode != 0 or "failed" in output.lower()
+            return [{
+                "test": "ibping",
+                "status": "FAIL" if failed else "PASS",
+                "role": "client" if server_addr or role == "client" else "local_loopback",
+                "direction": "outbound_to_peer" if server_addr or role == "client" else "local_loopback",
+                "target": str(target),
+                "count": count,
+                "output_tail": output[-500:],
+            }]
+        except Exception as e:
+            if server:
+                server.kill()
+            return [{"test": "ibping", "status": "FAIL", "error": str(e)}]
+
+    def _collect_pfc_ecn_counters(self) -> dict:
+        counters = {}
+        failed = False
+        keywords = ("pfc", "ecn", "cnp", "congestion")
+        for root, _, files in os.walk("/sys/class/infiniband"):
+            for name in files:
+                lower = name.lower()
+                if not any(k in lower for k in keywords):
+                    continue
+                path = os.path.join(root, name)
+                val = self._read_sys(path)
+                try:
+                    num = int(val)
+                except ValueError:
+                    continue
+                rel = path.replace("/sys/class/infiniband/", "")
+                counters[rel] = num
+                if num != 0:
+                    failed = True
+
+        ethtool = shutil.which("ethtool")
+        net_dir = "/sys/class/net"
+        if ethtool and os.path.isdir(net_dir):
+            for iface in sorted(os.listdir(net_dir)):
+                try:
+                    r = subprocess.run(
+                        [ethtool, "-S", iface],
+                        capture_output=True,
+                        text=True,
+                        timeout=10,
+                    )
                except Exception:
-                    server_proc.kill()
-            # ib_write_* server normally exits after one run; pkill cleans up a
-            # leftover one if the client failed mid-handshake. -x matches the exact
-            # process name so it never kills this ssh command itself.
-            try:
-                subprocess.run(
-                    ["ssh", "-o", "BatchMode=yes", server_target, f"pkill -x {tool}"],
-                    capture_output=True, timeout=10)
-            except Exception:
-                pass
-
-    @staticmethod
-    def _parse_perftest_bw(output: str) -> float:
-        """Parse ib_write_bw rows (#bytes #iter BW_peak BW_avg ...); return max BW avg."""
-        best = 0.0
-        for line in output.splitlines():
-            parts = line.split()
-            if len(parts) >= 4:
-                try:
-                    int(parts[0])              # #bytes column
-                    best = max(best, float(parts[3]))  # BW average[Gb/sec]
-                except ValueError:
                    continue
-        return round(best, 2) if best else 0.0
-
-    @staticmethod
-    def _parse_perftest_lat(output: str) -> dict:
-        """Parse ib_write_lat row (#bytes #iter t_min t_max t_typical t_avg ... 99%)."""
-        for line in output.splitlines():
-            parts = line.split()
-            if len(parts) >= 6:
-                try:
-                    int(parts[0]); int(parts[1])
-                    typical = float(parts[4])  # t_typical[usec]
-                except ValueError:
+                if r.returncode != 0:
                    continue
-                p99 = None
-                if len(parts) >= 8:
+                for line in r.stdout.splitlines():
+                    if ":" not in line:
+                        continue
+                    key, value = line.split(":", 1)
+                    key = key.strip()
+                    lower = key.lower()
+                    if not any(k in lower for k in keywords):
+                        continue
                    try:
-                        p99 = float(parts[7])  # 99% percentile[usec]
-                    except ValueError:
-                        p99 = None
-                return {"typical": round(typical, 2), "p99": round(p99, 2) if p99 else None}
-        return {"typical": None, "p99": None}
+                        num = int(value.strip().split()[0])
+                    except (ValueError, IndexError):
+                        continue
+                    counters[f"net/{iface}/{key}"] = num
+                    if num != 0:
+                        failed = True
+        return {"failed": failed, "counters": counters}

    @staticmethod
    def print_results(results: dict, console: Console = None):
@ -496,28 +596,10 @@ class RDMATest:
                        f"({lat:.2f} us, max: {t.get('max_allowed_us', 'N/A')} us)" if status != "SKIP"
                        else f"    {t['test']}: [dim]SKIPPED[/dim]")

-        cn = results.get("cross_node")
-        if cn:
-            if cn.get("skipped"):
-                c.print(f"\n  [bold]Cross-node RDMA[/bold]: [dim]SKIPPED "
-                        f"({cn.get('reason', '')})[/dim]")
-            else:
-                v = cn.get("status", "?")
-                vc = "green" if v == "PASS" else ("yellow" if v == "WARN" else "red")
-                c.print(f"\n  [bold]Cross-node RDMA[/bold] (server {cn.get('server')}) "
-                        f"[{vc}]{v}[/{vc}]  "
-                        f"[dim]min {cn.get('min_bandwidth_gbps')} Gb/s, "
-                        f"max {cn.get('max_latency_us')} µs[/dim]")
-                for e in cn.get("per_device", []):
-                    bw = e.get("bandwidth_gbps")
-                    lat = e.get("latency_us")
-                    bws = e.get("bw_status", "")
-                    lts = e.get("lat_status", "")
-                    bc = "green" if bws == "PASS" else ("yellow" if bws == "WARN" else "red")
-                    lc = "green" if lts == "PASS" else ("yellow" if lts == "WARN" else "red")
-                    bw_s = f"{bw:.1f} Gb/s" if isinstance(bw, (int, float)) else str(bw)
-                    lat_s = f"{lat:.2f} µs" if isinstance(lat, (int, float)) else str(lat)
-                    p99 = e.get("latency_p99_us")
-                    p99_s = f", p99 {p99:.2f}" if isinstance(p99, (int, float)) else ""
-                    c.print(f"    {e['device']}: BW [{bc}]{bw_s}[/{bc}] | "
-                            f"lat [{lc}]{lat_s}[/{lc}]{p99_s}")
+        ibping_tests = results.get("ibping_tests", [])
+        if ibping_tests:
+            c.print("\n  [bold]IB Ping Tests[/bold]")
+            for t in ibping_tests:
+                status = t.get("status", "FAIL")
+                sc = "green" if status == "PASS" else "red"
+                c.print(f"    {t['test']}: [{sc}]{status}[/{sc}] target={t.get('target', 'N/A')}")
--- a/reports_cublaslt_fp8_crosscheck_20260524.md
+++ b/reports_cublaslt_fp8_crosscheck_20260524.md
@ -1,87 +0,0 @@
-# cuBLASLt FP8 GEMM Cross-Check Report
-
-Date: 2026-05-24
-
-Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
-
-## Method
-
-Added a direct cuBLASLt FP8 GEMM micro-benchmark:
-
- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
- Input dtype: `CUDA_R_8F_E4M3`
- Output dtype: `CUDA_R_16BF`
- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
- Layout: cuBLASLt FP8-required TN format
- Matrix size: `8192`
- Warmup: `50`
- Iterations: `500`
- GPUs: single-node 8 GPUs, measured one GPU at a time
-
-NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
-
-## Results
-
-### aikubeworker0012 / nccl-gpu-1
-
-Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
-
-| GPU | FP8 TFLOPS |
-|---:|---:|
-| 0 | 1615.6 |
-| 1 | 1611.0 |
-| 2 | 1599.0 |
-| 3 | 1607.1 |
-| 4 | 1614.0 |
-| 5 | 1604.4 |
-| 6 | 1608.4 |
-| 7 | 1609.1 |
-
-Summary:
-
- Mean: `1608.6 TFLOPS`
- Min / Max: `1599.0 / 1615.6 TFLOPS`
- Spread: `1.03%`
- FP8 absolute threshold: `>= 1400 TFLOPS`
- Verdict against FP8 absolute threshold: **PASS**
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
-
-### aikubeworker0016 / nccl-gpu-2
-
-Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
-
-| GPU | FP8 TFLOPS |
-|---:|---:|
-| 0 | 1602.3 |
-| 1 | 1604.0 |
-| 2 | 1616.9 |
-| 3 | 1610.6 |
-| 4 | 1620.5 |
-| 5 | 1630.3 |
-| 6 | 1605.1 |
-| 7 | 1620.2 |
-
-Summary:
-
- Mean: `1613.7 TFLOPS`
- Min / Max: `1602.3 / 1630.3 TFLOPS`
- Spread: `1.74%`
- FP8 absolute threshold: `>= 1400 TFLOPS`
- Verdict against FP8 absolute threshold: **PASS**
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
-
-## Comparison With Existing PyTorch `_scaled_mm` Result
-
-| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
-|---|---:|---:|---:|
-| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
-| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
-
-The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
-
-## Conclusion
-
-The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
-
-Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.
--- a/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
+++ b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
@ -1,21 +0,0 @@
-{
-  "source": "cuBLASLt",
-  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-  "matrix_size": 8192,
-  "warmup": 50,
-  "iterations": 500,
-  "per_gpu": [
-    {"index": 0, "fp8_tflops": 1615.6},
-    {"index": 1, "fp8_tflops": 1611.0},
-    {"index": 2, "fp8_tflops": 1599.0},
-    {"index": 3, "fp8_tflops": 1607.1},
-    {"index": 4, "fp8_tflops": 1614.0},
-    {"index": 5, "fp8_tflops": 1604.4},
-    {"index": 6, "fp8_tflops": 1608.4},
-    {"index": 7, "fp8_tflops": 1609.1}
-  ],
-  "mean_tflops": 1608.6,
-  "min_tflops": 1599.0,
-  "max_tflops": 1615.6,
-  "spread_pct": 1.03
-}
--- a/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
+++ b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
@ -1,21 +0,0 @@
-{
-  "source": "cuBLASLt",
-  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-  "matrix_size": 8192,
-  "warmup": 50,
-  "iterations": 500,
-  "per_gpu": [
-    {"index": 0, "fp8_tflops": 1602.3},
-    {"index": 1, "fp8_tflops": 1604.0},
-    {"index": 2, "fp8_tflops": 1616.9},
-    {"index": 3, "fp8_tflops": 1610.6},
-    {"index": 4, "fp8_tflops": 1620.5},
-    {"index": 5, "fp8_tflops": 1630.3},
-    {"index": 6, "fp8_tflops": 1605.1},
-    {"index": 7, "fp8_tflops": 1620.2}
-  ],
-  "mean_tflops": 1613.7,
-  "min_tflops": 1602.3,
-  "max_tflops": 1630.3,
-  "spread_pct": 1.74
-}
--- a/reports_fp8_path_comparison_20260525.md
+++ b/reports_fp8_path_comparison_20260525.md
@ -1,169 +0,0 @@
-# FP8 GEMM 路径对比测试报告
-
-测试日期：2026-05-25  
-测试节点：aikubeworker0012、aikubeworker0016  
-测试 GPU：NVIDIA H100 80GB HBM3  
-测试目标：对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
-
-## 一、测试结论
-
-本次 A-E 五条路径均已完成实测。
-
-核心结论：
-
-1. direct cuBLASLt 是本组测试里最快路径，两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
-2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
-3. 打开 `use_fast_accum=True` 后，PyTorch eager 路径有稳定提升，约提升 5.0%-6.7%。
-4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS，但仍低于 direct cuBLASLt。
-5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径，不是裸 GEMM，因此包含 TE module、cast、FP8 recipe 等额外开销，结果低于 direct cuBLASLt，也低于 CUDA Graph `_scaled_mm`。
-
-这说明：当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题；之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果，主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能，而不是 GPU 算力极限。
-
-## 二、测试方法
-
-统一参数：
-
-| 参数 | 值 |
-|---|---:|
-| matrix_size | 8192 |
-| M/N/K | 8192/8192/8192 |
-| warmup | 50 |
-| iterations | 500 |
-| GPU index | 0 |
-| PyTorch | 2.6.0+cu124 |
-| CUDA | 12.4 |
-| 输入 dtype | FP8 E4M3 |
-| 输出 dtype | BF16 |
-| accumulation | FP32 |
-| scale_a / scale_b | 1.0 / 1.0 |
-
-测试路径定义：
-
-| 路径 | 名称 | 含义 |
-|---|---|---|
-| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`，默认 accumulation 参数 |
-| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径，但显式打开 fast accumulation |
-| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用，降低 Python/PyTorch launch 间隙 |
-| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行，包含 TE 层封装和 FP8 recipe 开销 |
-| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`，绕过 PyTorch eager |
-
-复现脚本：
-
-```bash
-MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
-  /root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
-```
-
-## 三、实测结果
-
-### aikubeworker0012
-
-原始 JSON：`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
-
-| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
-|---|---|---:|---:|
-| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
-| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
-| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
-| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
-| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
-
-相对 A 的提升：
-
-| 路径 | 相对 A |
-|---|---:|
-| B | +6.7% |
-| C | +11.5% |
-| D | -2.8% |
-| E | +37.1% |
-
-E 路径 cuBLASLt 算法信息：
-
-| 字段 | 值 |
-|---|---:|
-| algo_id | 52 |
-| tile_id | 23 |
-| splitk | 1 |
-| stages_id | 36 |
-| inner_shape_id | 0 |
-| cluster_shape_id | 3 |
-
-### aikubeworker0016
-
-原始 JSON：`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
-
-| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
-|---|---|---:|---:|
-| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
-| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
-| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
-| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
-| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
-
-相对 A 的提升：
-
-| 路径 | 相对 A |
-|---|---:|
-| B | +5.0% |
-| C | +10.0% |
-| D | -3.2% |
-| E | +37.5% |
-
-E 路径 cuBLASLt 算法信息：
-
-| 字段 | 值 |
-|---|---:|
-| algo_id | 52 |
-| tile_id | 23 |
-| splitk | 1 |
-| stages_id | 36 |
-| inner_shape_id | 0 |
-| cluster_shape_id | 3 |
-
-## 四、对 PyTorch FP8 能否“上去”的判断
-
-从本次结果看，PyTorch FP8 路径可以通过两类方式上去：
-
-1. 打开更快的 math/accumulation 参数，例如 `use_fast_accum=True`。
-2. 使用 CUDA Graph replay，减少 eager 模式下每轮调度、enqueue 之间的间隙。
-
-但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下，PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说，direct cuBLASLt 证明硬件和底层库有能力跑得更高；PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
-
-如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”，后续可以继续验证：
-
-1. 更大的 GEMM size，例如 16384。
-2. 固定 shape 后用 `torch.compile` 或 Inductor。
-3. CUDA Graph 覆盖更完整的 step，而不是只 replay 单个 op。
-4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark，而不是 `te.Linear` module forward。
-5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取，确认实际 kernel、间隙和 cuBLASLt 算法选择。
-
-## 五、术语说明
-
-`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`，PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic，然后把 matmul enqueue 到 CUDA stream。
-
-`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口，支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
-
-`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`，不经过 PyTorch eager，因此更接近裸 GEMM 峰值。
-
-`CUDA Graph` 指把一次 CUDA work 提前捕获成图，后续直接 replay，减少 CPU 侧反复 launch/调度带来的间隙。
-
-`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward，不等同于裸 GEMM microbenchmark。
-
-## 六、文件清单
-
-本地脚本：
-
-| 文件 | 用途 |
-|---|---|
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
-
-本地结果：
-
-| 文件 | 用途 |
-|---|---|
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
-
--- a/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
+++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
@ -1,142 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 465.145,
-        "event_us_per_iter": 930.29,
-        "wall_ms_total": 465.21,
-        "tflops": 1181.9
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 440.252,
-        "event_us_per_iter": 880.504,
-        "wall_ms_total": 440.289,
-        "tflops": 1248.7
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 415.631,
-        "event_us_per_iter": 831.262,
-        "wall_ms_total": 415.664,
-        "tflops": 1322.7
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "unavailable",
-        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-      }
-    ],
-    "summary": {
-      "max_tflops": 1322.7,
-      "min_tflops": 1181.9,
-      "mean_tflops": 1251.1
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1615.4,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1615.4,
-    "min_tflops": 1615.4,
-    "max_tflops": 1615.4,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 465.145,
-      "event_us_per_iter": 930.29,
-      "wall_ms_total": 465.21,
-      "tflops": 1181.9
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 440.252,
-      "event_us_per_iter": 880.504,
-      "wall_ms_total": 440.289,
-      "tflops": 1248.7
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 415.631,
-      "event_us_per_iter": 831.262,
-      "wall_ms_total": 415.664,
-      "tflops": 1322.7
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "unavailable",
-      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1615.4,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
--- a/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
+++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
@ -1,156 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 463.507,
-        "event_us_per_iter": 927.014,
-        "wall_ms_total": 463.573,
-        "tflops": 1186.1
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 434.241,
-        "event_us_per_iter": 868.481,
-        "wall_ms_total": 434.492,
-        "tflops": 1266.0
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 415.786,
-        "event_us_per_iter": 831.573,
-        "wall_ms_total": 415.825,
-        "tflops": 1322.2
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 476.739,
-        "event_us_per_iter": 953.478,
-        "wall_ms_total": 476.8,
-        "tflops": 1153.2,
-        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-      }
-    ],
-    "summary": {
-      "max_tflops": 1322.2,
-      "min_tflops": 1153.2,
-      "mean_tflops": 1231.9
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1626.6,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1626.6,
-    "min_tflops": 1626.6,
-    "max_tflops": 1626.6,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 463.507,
-      "event_us_per_iter": 927.014,
-      "wall_ms_total": 463.573,
-      "tflops": 1186.1
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 434.241,
-      "event_us_per_iter": 868.481,
-      "wall_ms_total": 434.492,
-      "tflops": 1266.0
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 415.786,
-      "event_us_per_iter": 831.573,
-      "wall_ms_total": 415.825,
-      "tflops": 1322.2
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 476.739,
-      "event_us_per_iter": 953.478,
-      "wall_ms_total": 476.8,
-      "tflops": 1153.2,
-      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1626.6,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
--- a/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
+++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
@ -1,142 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 470.909,
-        "event_us_per_iter": 941.817,
-        "wall_ms_total": 470.974,
-        "tflops": 1167.4
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 452.608,
-        "event_us_per_iter": 905.215,
-        "wall_ms_total": 452.647,
-        "tflops": 1214.6
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 427.724,
-        "event_us_per_iter": 855.449,
-        "wall_ms_total": 427.768,
-        "tflops": 1285.3
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "unavailable",
-        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-      }
-    ],
-    "summary": {
-      "max_tflops": 1285.3,
-      "min_tflops": 1167.4,
-      "mean_tflops": 1222.4
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1594.3,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1594.3,
-    "min_tflops": 1594.3,
-    "max_tflops": 1594.3,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 470.909,
-      "event_us_per_iter": 941.817,
-      "wall_ms_total": 470.974,
-      "tflops": 1167.4
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 452.608,
-      "event_us_per_iter": 905.215,
-      "wall_ms_total": 452.647,
-      "tflops": 1214.6
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 427.724,
-      "event_us_per_iter": 855.449,
-      "wall_ms_total": 427.768,
-      "tflops": 1285.3
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "unavailable",
-      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1594.3,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
--- a/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
+++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
@ -1,156 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 473.156,
-        "event_us_per_iter": 946.313,
-        "wall_ms_total": 473.199,
-        "tflops": 1161.9
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 450.48,
-        "event_us_per_iter": 900.96,
-        "wall_ms_total": 450.505,
-        "tflops": 1220.4
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 430.272,
-        "event_us_per_iter": 860.543,
-        "wall_ms_total": 430.304,
-        "tflops": 1277.7
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 488.527,
-        "event_us_per_iter": 977.054,
-        "wall_ms_total": 488.576,
-        "tflops": 1125.3,
-        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-      }
-    ],
-    "summary": {
-      "max_tflops": 1277.7,
-      "min_tflops": 1125.3,
-      "mean_tflops": 1196.3
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1598.1,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1598.1,
-    "min_tflops": 1598.1,
-    "max_tflops": 1598.1,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 473.156,
-      "event_us_per_iter": 946.313,
-      "wall_ms_total": 473.199,
-      "tflops": 1161.9
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 450.48,
-      "event_us_per_iter": 900.96,
-      "wall_ms_total": 450.505,
-      "tflops": 1220.4
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 430.272,
-      "event_us_per_iter": 860.543,
-      "wall_ms_total": 430.304,
-      "tflops": 1277.7
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 488.527,
-      "event_us_per_iter": 977.054,
-      "wall_ms_total": 488.576,
-      "tflops": 1125.3,
-      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1598.1,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
--- a/reports_gpu_Test_combined_20260524.md
+++ b/reports_gpu_Test_combined_20260524.md
@ -1,152 +0,0 @@
-# GPU_Test 合并报告
-
- **日期:** 2026-05-24
- **节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
- **范围:** 单机单卡算力与多机多卡 NCCL 通信
- **说明:** 本报告汇总既有原始测试结果，不重新启动额外压力测试。
-
-## 总体结论
-
-| 测试项 | 结论 | 说明 |
-|---|---|---|
-| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
-| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
-| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`，低于阈值；已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低，不作为硬件失败依据 |
-| 多机多卡 NCCL 正确性 | PASS | return code `0`，`Wrong=0` / `Out of bounds values: 0 OK` |
-| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值，但该阈值不应直接硬套到当前 4x400Gbps 环境 |
-
-## 单机单卡 / 算力测试
-
-### 机器信息
-
-| Host | GPU | Driver | CUDA | GPU 数量 |
-|---|---|---|---|---:|
-| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-
-来源：
-
- `reports_single_gpu_aikubeworker0012.md`
- `reports_single_gpu_aikubeworker0016.md`
-
-### 原始 PyTorch 单机算力结果
-
-| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
-|---|---:|---:|---:|---:|---:|---|
-| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
-| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
-
-原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示，该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响，不能直接代表 H100 FP8 Tensor Core 硬件上限。
-
-### direct cuBLASLt FP8 GEMM 交叉验证
-
-测试参数：
-
-| 参数 | 值 |
-|---|---|
-| Benchmark | direct cuBLASLt FP8 GEMM |
-| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
-| Matrix | `8192 x 8192 x 8192` |
-| A/B dtype | FP8 E4M3 |
-| Output dtype | BF16 |
-| Compute type | `CUBLAS_COMPUTE_32F` |
-| Scale type | `CUDA_R_32F` |
-| Scale A/B | `1.0` |
-| Layout | TN |
-| fast accumulation | enabled |
-| Threshold | `>= 1400 TFLOPS` |
-
-结果：
-
-| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
-|---|---:|---:|---:|---:|---:|---|
-| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
-| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
-
-单卡逐张结果：
-
-| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|
-| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
-| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
-
-结论：direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值，说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录，而不是 GPU 硬件失败结论。
-
-来源：
-
- `reports_cublaslt_fp8_crosscheck_20260524.md`
- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
-
-## 多机多卡 NCCL 测试
-
-### 测试环境
-
-| 项目 | 结果 |
-|---|---|
-| Hosts | `nccl-gpu-1(172.72.8.12)`，`nccl-gpu-2(172.72.8.16)` |
-| Topology | 2 nodes x 8 GPUs，合计 16 GPUs |
-| NCCL source | `nccl-tests-mpirun` |
-| NCCL network | IB |
-| GPU Direct RDMA | ENABLED |
-| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
-| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
-
-注意：NCCL 表里的 `GB/s` 是大 B，即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b，即 bits/s。
-
-### 2x8 全集合通信结果
-
-| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
-|---|---:|---:|---:|---|---|
-| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
-| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
-| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-
-结论：2x8 全集合通信测试中，NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值，但当前机器确认参与 NCCL 的是 4 条 400Gbps rail，因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
-
-来源：
-
- `reports_multinode_nccl_all_collectives_20260523_120144.md`
- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
-
-### PDF Matrix allreduce / alltoall 结果
-
-AllReduce（PDF 8x400Gbps 阈值对比，仅作参考）:
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
-| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
-
-AllToAll（PDF 8x400Gbps 阈值对比，仅作参考）:
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
-| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
-
-来源：
-
- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
-
-## 风险与判断
-
-1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证，当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
-2. 多机 NCCL 正确性通过，性能结果应按当前 4x400Gbps rail 环境解释。
-3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail；PDF 参考环境为 8x400G 计算管理网络，因此 2x8 阈值与当前硬件形态不等价。
-4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值，建议作为“与 PDF 参考环境差异”记录，而不是作为当前 4 rail 环境不合格结论。
-
-## 建议
-
-1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主，PyTorch `_scaled_mm` 作为软件栈参考项保留。
-2. 多机 NCCL 后续若要按 PDF 阈值验收，需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
-3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`：NCCL bus bandwidth 是大 B，IB 端口速率是小 b。
--- a/reports_gpu_Test_formal_20260524.md
+++ b/reports_gpu_Test_formal_20260524.md
@ -1,123 +0,0 @@
-# GPU_Test 双节点测试报告
-
- **测试日期:** 2026-05-24
- **测试节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
-
-## 结论摘要
-
-| 项目 | 结果摘要 |
-|---|---|
-| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
-| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS |
-| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成，主要大包通信带宽稳定 |
-| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过，所有测试 `Wrong=0` / return code `0` |
-| 多机网络口径 | 当前为 4x400Gbps IB rail 环境，结果按该硬件形态解释 |
-
-## 测试环境
-
-| Host | GPU | Driver | CUDA | GPU 数量 |
-|---|---|---|---|---:|
-| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-
-## 单机算力测试
-
-### FP8 GEMM 硬件路径验证
-
-本项使用 direct cuBLASLt FP8 GEMM benchmark，绕过 PyTorch eager 调度路径，直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
-
-| 参数 | 配置 |
-|---|---|
-| GEMM shape | `8192 x 8192 x 8192` |
-| 输入类型 | FP8 E4M3 |
-| 输出类型 | BF16 |
-| 累加类型 | FP32 compute |
-| Layout | TN |
-| Scale | `scale_a = 1.0`，`scale_b = 1.0` |
-| fast accumulation | enabled |
-| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
-
-| Host | Mean FP8 TFLOPS | Min | Max | Spread |
-|---|---:|---:|---:|---:|
-| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
-| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
-
-| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|
-| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
-| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
-
-**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS，该结果反映 PyTorch 软件路径与调度开销，不作为本报告的硬件算力结论。
-
-## 单机 8 卡 NCCL 通信测试
-
-本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试，结果单位为 `GB/s`，即 Bytes/s。
-
-| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
-|---|---:|---:|
-| allreduce | 472.3 GB/s | 472.4 GB/s |
-| alltoall | 343.3 GB/s | 344.3 GB/s |
-| broadcast | 364.1 GB/s | 363.6 GB/s |
-| reducescatter | 352.8 GB/s | 353.1 GB/s |
-| allgather | 366.4 GB/s | 366.4 GB/s |
-| sendrecv | 369.0 GB/s | 368.9 GB/s |
-
-**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近，节点间差异较小。
-
-## 多机 2x8 NCCL 通信测试
-
-本项使用两台节点，每台 8 张 GPU，共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
-
-### 网络环境
-
-| 项目 | 配置 |
-|---|---|
-| Host A | `aikubeworker0012 / 172.72.8.12` |
-| Host B | `aikubeworker0016 / 172.72.8.16` |
-| 拓扑 | 2 nodes x 8 GPUs |
-| NCCL network | IB |
-| GPU Direct RDMA | ENABLED |
-| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
-| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
-
-### 跨节点 NCCL 结果
-
-| Operation | Peak Bus BW | Avg Bus BW | Correctness |
-|---|---:|---:|---|
-| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
-| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
-| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
-| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
-| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
-| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
-
-**正确性:** 本轮多机 NCCL 测试 return code 为 `0`，`Wrong=0`，未发现数据正确性错误。
-
-## 单位说明
-
-| 写法 | 含义 | 说明 |
-|---|---|---|
-| `GB/s` | Gigabytes per second | 大 B，字节每秒，NCCL bus bandwidth 使用此单位 |
-| `Gbps` / `Gb/s` | Gigabits per second | 小 b，比特每秒，IB 端口速率通常使用此单位 |
-
-换算关系：
-
-```text
-1 Byte = 8 bits
-400 Gb/s = 50 GB/s
-4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
-```
-
-NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽，不等同于单条物理链路的线速。
-
-## 结果说明
-
-1. 两台节点 GPU 识别正常，均为 8 张 H100 80GB HBM3。
-2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPS，GPU FP8 硬件计算路径正常。
-3. 单机 8 卡 NCCL 通信在两台节点上结果接近，未观察到明显节点间异常差异。
-4. 多机 2x8 NCCL 正确性通过，跨节点通信功能正常。
-5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释；若后续需要对齐 8x400Gbps 环境，应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
-
--- a/reports_gpu_Test_pdf.css
+++ b/reports_gpu_Test_pdf.css
@ -1,102 +0,0 @@
-@page {
-  size: A4 landscape;
-  margin: 13mm;
-}
-
-body {
-  color: #111827;
-  font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
-  font-size: 11px;
-  line-height: 1.45;
-}
-
-h1 {
-  color: #0f172a;
-  font-size: 24px;
-  margin: 0 0 14px;
-}
-
-h2 {
-  border-bottom: 1px solid #cbd5e1;
-  color: #0f172a;
-  font-size: 17px;
-  margin: 24px 0 10px;
-  padding-bottom: 4px;
-}
-
-h3 {
-  color: #1f2937;
-  font-size: 13px;
-  margin: 16px 0 8px;
-}
-
-p {
-  margin: 7px 0;
-}
-
-code {
-  background: #f1f5f9;
-  border-radius: 3px;
-  color: #0f172a;
-  font-family: Menlo, Consolas, monospace;
-  font-size: 10px;
-  padding: 1px 3px;
-}
-
-pre {
-  background: #f8fafc;
-  border: 1px solid #e2e8f0;
-  border-radius: 4px;
-  padding: 8px;
-  white-space: pre-wrap;
-}
-
-table {
-  border-collapse: collapse;
-  margin: 8px 0 14px;
-  page-break-inside: auto;
-  width: 100%;
-}
-
-thead {
-  display: table-header-group;
-}
-
-tr {
-  page-break-inside: avoid;
-}
-
-th,
-td {
-  border: 1px solid #cbd5e1;
-  padding: 5px 6px;
-  text-align: left;
-  vertical-align: middle;
-  word-break: break-word;
-}
-
-th {
-  background: #e2e8f0;
-  color: #0f172a;
-  font-weight: 700;
-}
-
-tbody tr:nth-child(even) td {
-  background: #f8fafc;
-}
-
-a {
-  color: #2563eb;
-  text-decoration: none;
-}
-
-ul,
-ol {
-  margin: 6px 0 10px 20px;
-  padding: 0;
-}
-
-li {
-  margin: 3px 0;
-}
-
--- a/scripts/cublaslt_fp8_gemm_bench.cu
+++ b/scripts/cublaslt_fp8_gemm_bench.cu
@ -1,291 +0,0 @@
-#include <cublasLt.h>
-#include <cuda_bf16.h>
-#include <cuda_fp8.h>
-#include <cuda_runtime.h>
-
-#include <algorithm>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#define CHECK_CUDA(call)                                                       \
-  do {                                                                         \
-    cudaError_t status = (call);                                               \
-    if (status != cudaSuccess) {                                               \
-      std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
-                   cudaGetErrorString(status));                                \
-      std::exit(1);                                                            \
-    }                                                                          \
-  } while (0)
-
-#define CHECK_CUBLAS(call)                                                     \
-  do {                                                                         \
-    cublasStatus_t status = (call);                                            \
-    if (status != CUBLAS_STATUS_SUCCESS) {                                     \
-      std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__,      \
-                   __LINE__, static_cast<int>(status));                        \
-      std::exit(1);                                                            \
-    }                                                                          \
-  } while (0)
-
-__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
-  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  size_t stride = blockDim.x * gridDim.x;
-  for (size_t i = tid; i < count; i += stride) {
-    ptr[i] = __nv_fp8_e4m3(value);
-  }
-}
-
-struct Args {
-  int matrix_size = 8192;
-  int warmup = 20;
-  int iterations = 200;
-  int first_gpu = 0;
-  int gpu_count = -1;
-  size_t workspace_mb = 256;
-  int fast_accum = 1;
-};
-
-static Args parse_args(int argc, char **argv) {
-  Args args;
-  for (int i = 1; i < argc; ++i) {
-    auto need = [&](const char *name) {
-      if (i + 1 >= argc) {
-        std::fprintf(stderr, "Missing value for %s\n", name);
-        std::exit(2);
-      }
-      return argv[++i];
-    };
-    if (!std::strcmp(argv[i], "--matrix-size")) {
-      args.matrix_size = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--warmup")) {
-      args.warmup = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--iterations")) {
-      args.iterations = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--first-gpu")) {
-      args.first_gpu = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--gpu-count")) {
-      args.gpu_count = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--workspace-mb")) {
-      args.workspace_mb = static_cast<size_t>(std::atoll(need(argv[i])));
-    } else if (!std::strcmp(argv[i], "--fast-accum")) {
-      args.fast_accum = std::atoi(need(argv[i]));
-    } else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
-      std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
-                "[--iterations N] [--first-gpu N] [--gpu-count N] "
-                "[--workspace-mb N] [--fast-accum 0|1]");
-      std::exit(0);
-    } else {
-      std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
-      std::exit(2);
-    }
-  }
-  return args;
-}
-
-static double run_one_gpu(int gpu, const Args &args) {
-  CHECK_CUDA(cudaSetDevice(gpu));
-
-  const int64_t m = args.matrix_size;
-  const int64_t n = args.matrix_size;
-  const int64_t k = args.matrix_size;
-  const size_t a_elems = static_cast<size_t>(m) * k;
-  const size_t b_elems = static_cast<size_t>(k) * n;
-  const size_t d_elems = static_cast<size_t>(m) * n;
-
-  __nv_fp8_e4m3 *d_a = nullptr;
-  __nv_fp8_e4m3 *d_b = nullptr;
-  __nv_bfloat16 *d_d = nullptr;
-  void *workspace = nullptr;
-  float *d_scale_a = nullptr;
-  float *d_scale_b = nullptr;
-  const float scale = 1.0f;
-  const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
-
-  CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
-  CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
-  CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
-  CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
-  CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
-  CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
-  CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
-  CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
-
-  const int threads = 256;
-  const int blocks = 4096;
-  fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
-  fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
-  CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
-  CHECK_CUDA(cudaGetLastError());
-  CHECK_CUDA(cudaDeviceSynchronize());
-
-  cublasLtHandle_t lt;
-  cublasLtMatmulDesc_t op_desc;
-  cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
-  cublasLtMatmulPreference_t preference;
-  CHECK_CUBLAS(cublasLtCreate(&lt));
-  CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
-
-  // cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
-  // With square GEMMs this keeps the benchmark FLOP count identical to the PDF
-  // acceptance shape while satisfying the library's FP8 kernel constraints.
-  cublasOperation_t transa = CUBLAS_OP_T;
-  cublasOperation_t transb = CUBLAS_OP_N;
-  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
-      op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
-  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
-      op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
-  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
-      op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
-      sizeof(d_scale_a)));
-  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
-      op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
-      sizeof(d_scale_b)));
-  int8_t fast_accum = args.fast_accum ? 1 : 0;
-  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
-      op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
-      sizeof(fast_accum)));
-
-  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
-  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
-  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
-
-  CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
-  CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
-      preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
-      sizeof(workspace_bytes)));
-
-  cublasLtMatmulHeuristicResult_t heuristic;
-  int returned = 0;
-  CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
-      lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
-      &returned));
-  if (returned == 0) {
-    std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
-    std::exit(1);
-  }
-
-  auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
-    int32_t value = -1;
-    size_t written = 0;
-    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
-        &heuristic.algo, attr, &value, sizeof(value), &written));
-    return static_cast<int>(value);
-  };
-  auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
-    uint32_t value = 0;
-    size_t written = 0;
-    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
-        &heuristic.algo, attr, &value, sizeof(value), &written));
-    return static_cast<int>(value);
-  };
-  auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
-    uint16_t value = 0;
-    size_t written = 0;
-    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
-        &heuristic.algo, attr, &value, sizeof(value), &written));
-    return static_cast<int>(value);
-  };
-  const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
-  const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
-  const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
-  const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
-  const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
-  const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
-
-  const float alpha = 1.0f;
-  const float beta = 0.0f;
-  auto matmul = [&]() {
-    CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
-                                &beta, d_d, d_desc, d_d, d_desc,
-                                &heuristic.algo, workspace, workspace_bytes, 0));
-  };
-
-  for (int i = 0; i < args.warmup; ++i) {
-    matmul();
-  }
-  CHECK_CUDA(cudaDeviceSynchronize());
-
-  cudaEvent_t start, stop;
-  CHECK_CUDA(cudaEventCreate(&start));
-  CHECK_CUDA(cudaEventCreate(&stop));
-  CHECK_CUDA(cudaEventRecord(start));
-  for (int i = 0; i < args.iterations; ++i) {
-    matmul();
-  }
-  CHECK_CUDA(cudaEventRecord(stop));
-  CHECK_CUDA(cudaEventSynchronize(stop));
-  float elapsed_ms = 0.0f;
-  CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
-  const double flops =
-      2.0 * static_cast<double>(m) * static_cast<double>(n) *
-      static_cast<double>(k) * static_cast<double>(args.iterations);
-  const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
-  std::printf(
-      "    {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
-      "\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
-      "\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
-      gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
-      (gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
-  std::fflush(stdout);
-
-  CHECK_CUDA(cudaEventDestroy(start));
-  CHECK_CUDA(cudaEventDestroy(stop));
-  CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
-  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
-  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
-  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
-  CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
-  CHECK_CUBLAS(cublasLtDestroy(lt));
-  CHECK_CUDA(cudaFree(d_a));
-  CHECK_CUDA(cudaFree(d_b));
-  CHECK_CUDA(cudaFree(d_d));
-  CHECK_CUDA(cudaFree(workspace));
-  CHECK_CUDA(cudaFree(d_scale_a));
-  CHECK_CUDA(cudaFree(d_scale_b));
-  CHECK_CUDA(cudaDeviceSynchronize());
-
-  return tflops;
-}
-
-int main(int argc, char **argv) {
-  Args args = parse_args(argc, argv);
-  int device_count = 0;
-  CHECK_CUDA(cudaGetDeviceCount(&device_count));
-  if (args.gpu_count < 0) {
-    args.gpu_count = device_count - args.first_gpu;
-  }
-  if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
-    std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
-                 args.first_gpu, args.gpu_count, device_count);
-    return 2;
-  }
-
-  std::vector<double> values;
-  std::printf("{\n");
-  std::printf("  \"source\": \"cuBLASLt\",\n");
-  std::printf("  \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
-  std::printf("  \"matrix_size\": %d,\n", args.matrix_size);
-  std::printf("  \"warmup\": %d,\n", args.warmup);
-  std::printf("  \"iterations\": %d,\n", args.iterations);
-  std::printf("  \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
-  std::printf("  \"per_gpu\": [\n");
-  for (int i = 0; i < args.gpu_count; ++i) {
-    int gpu = args.first_gpu + i;
-    double tflops = run_one_gpu(gpu, args);
-    values.push_back(tflops);
-  }
-  double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
-  auto minmax = std::minmax_element(values.begin(), values.end());
-  double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
-  std::printf("  ],\n");
-  std::printf("  \"mean_tflops\": %.1f,\n", mean);
-  std::printf("  \"min_tflops\": %.1f,\n", *minmax.first);
-  std::printf("  \"max_tflops\": %.1f,\n", *minmax.second);
-  std::printf("  \"spread_pct\": %.2f\n", spread);
-  std::printf("}\n");
-  return mean >= 1400.0 ? 0 : 1;
-}
--- a/scripts/pytorch_fp8_path_bench.py
+++ b/scripts/pytorch_fp8_path_bench.py
@ -1,277 +0,0 @@
-#!/usr/bin/env python3
-"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
-
-Paths:
-  A. torch._scaled_mm eager, default accumulation
-  B. torch._scaled_mm eager, use_fast_accum=True
-  C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
-  D. Transformer Engine Linear under fp8_autocast, when installed
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import statistics
-import sys
-import time
-from typing import Any, Callable
-
-import torch
-
-
-def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
-    flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
-    return flops / (elapsed_ms / 1000.0) / 1e12
-
-
-def cuda_event_bench(
-    name: str,
-    matrix_size: int,
-    iterations: int,
-    warmup: int,
-    func: Callable[[int], Any],
-) -> dict[str, Any]:
-    for i in range(warmup):
-        func(i)
-    torch.cuda.synchronize()
-
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    wall_start = time.perf_counter()
-    start.record()
-    for i in range(iterations):
-        func(i)
-    end.record()
-    torch.cuda.synchronize()
-    wall_elapsed = time.perf_counter() - wall_start
-    elapsed_ms = start.elapsed_time(end)
-    return {
-        "name": name,
-        "status": "ok",
-        "matrix_size": matrix_size,
-        "iterations": iterations,
-        "warmup": warmup,
-        "event_ms_total": round(elapsed_ms, 3),
-        "event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
-        "wall_ms_total": round(wall_elapsed * 1000.0, 3),
-        "tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
-    }
-
-
-def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-    a = [
-        torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
-        for _ in range(pools)
-    ]
-    b = [
-        torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
-        for _ in range(pools)
-    ]
-    torch.cuda.synchronize()
-    return a, b
-
-
-def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
-    device = f"cuda:{args.gpu_index}"
-    torch.cuda.set_device(args.gpu_index)
-    scale_a = torch.tensor(1.0, device=device)
-    scale_b = torch.tensor(1.0, device=device)
-    pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
-    results: list[dict[str, Any]] = []
-
-    def eager_default(i: int) -> torch.Tensor:
-        idx = i % args.pools
-        return torch._scaled_mm(
-            pools_a[idx],
-            pools_b[idx].T,
-            scale_a=scale_a,
-            scale_b=scale_b,
-            out_dtype=torch.bfloat16,
-        )
-
-    def eager_fast(i: int) -> torch.Tensor:
-        idx = i % args.pools
-        return torch._scaled_mm(
-            pools_a[idx],
-            pools_b[idx].T,
-            scale_a=scale_a,
-            scale_b=scale_b,
-            out_dtype=torch.bfloat16,
-            use_fast_accum=True,
-        )
-
-    results.append(
-        cuda_event_bench(
-            "A_eager_scaled_mm_default",
-            args.matrix_size,
-            args.iterations,
-            args.warmup,
-            eager_default,
-        )
-    )
-    results.append(
-        cuda_event_bench(
-            "B_eager_scaled_mm_fast_accum",
-            args.matrix_size,
-            args.iterations,
-            args.warmup,
-            eager_fast,
-        )
-    )
-
-    graph_out = torch.empty(
-        (args.matrix_size, args.matrix_size),
-        device=device,
-        dtype=torch.bfloat16,
-    )
-    static_a = pools_a[0]
-    static_b_t = pools_b[0].T
-
-    try:
-        side_stream = torch.cuda.Stream()
-        side_stream.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(side_stream):
-            for _ in range(max(3, args.warmup // 2)):
-                torch._scaled_mm(
-                    static_a,
-                    static_b_t,
-                    scale_a=scale_a,
-                    scale_b=scale_b,
-                    out_dtype=torch.bfloat16,
-                    use_fast_accum=True,
-                    out=graph_out,
-                )
-        torch.cuda.current_stream().wait_stream(side_stream)
-        torch.cuda.synchronize()
-
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            torch._scaled_mm(
-                static_a,
-                static_b_t,
-                scale_a=scale_a,
-                scale_b=scale_b,
-                out_dtype=torch.bfloat16,
-                use_fast_accum=True,
-                out=graph_out,
-            )
-
-        def graph_replay(_: int) -> None:
-            graph.replay()
-
-        results.append(
-            cuda_event_bench(
-                "C_cuda_graph_scaled_mm_fast_accum",
-                args.matrix_size,
-                args.iterations,
-                3,
-                graph_replay,
-            )
-        )
-    except Exception as exc:  # noqa: BLE001
-        results.append(
-            {
-                "name": "C_cuda_graph_scaled_mm_fast_accum",
-                "status": "unavailable",
-                "reason": f"{type(exc).__name__}: {exc}",
-            }
-        )
-
-    return results
-
-
-def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
-    try:
-        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
-        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
-    except Exception as exc:  # noqa: BLE001
-        return {
-            "name": "D_transformer_engine_fp8_linear",
-            "status": "unavailable",
-            "reason": f"{type(exc).__name__}: {exc}",
-        }
-
-    device = f"cuda:{args.gpu_index}"
-    x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
-    layer = te.Linear(
-        args.matrix_size,
-        args.matrix_size,
-        bias=False,
-        params_dtype=torch.bfloat16,
-        device=device,
-    )
-    recipe = DelayedScaling(fp8_format=Format.HYBRID)
-
-    def run(_: int) -> torch.Tensor:
-        with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
-            return layer(x)
-
-    try:
-        result = cuda_event_bench(
-            "D_transformer_engine_fp8_linear",
-            args.matrix_size,
-            args.iterations,
-            args.warmup,
-            run,
-        )
-    except Exception as exc:  # noqa: BLE001
-        return {
-            "name": "D_transformer_engine_fp8_linear",
-            "status": "error",
-            "reason": f"{type(exc).__name__}: {exc}",
-        }
-    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-    return result
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--matrix-size", type=int, default=8192)
-    parser.add_argument("--warmup", type=int, default=20)
-    parser.add_argument("--iterations", type=int, default=100)
-    parser.add_argument("--gpu-index", type=int, default=0)
-    parser.add_argument("--pools", type=int, default=4)
-    args = parser.parse_args()
-
-    if not torch.cuda.is_available():
-        print(json.dumps({"error": "cuda unavailable"}, indent=2))
-        return 1
-    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
-        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
-        return 1
-
-    torch.cuda.set_device(args.gpu_index)
-    props = torch.cuda.get_device_properties(args.gpu_index)
-    payload = {
-        "source": "pytorch_fp8_path_bench",
-        "torch": torch.__version__,
-        "cuda": torch.version.cuda,
-        "gpu_index": args.gpu_index,
-        "gpu_name": props.name,
-        "matrix_size": args.matrix_size,
-        "warmup": args.warmup,
-        "iterations": args.iterations,
-        "results": [],
-    }
-    try:
-        payload["results"].extend(bench_scaled_mm(args))
-        payload["results"].append(bench_transformer_engine(args))
-    except torch.cuda.OutOfMemoryError as exc:
-        payload["error"] = f"CUDA OOM: {exc}"
-        print(json.dumps(payload, indent=2))
-        return 1
-
-    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
-    if ok_values:
-        payload["summary"] = {
-            "max_tflops": round(max(ok_values), 1),
-            "min_tflops": round(min(ok_values), 1),
-            "mean_tflops": round(statistics.mean(ok_values), 1),
-        }
-    print(json.dumps(payload, indent=2))
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/scripts/run_cublaslt_fp8_gemm.sh
+++ b/scripts/run_cublaslt_fp8_gemm.sh
@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-set -uo pipefail
-
-SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
-PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
-
-CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
-NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
-OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
-MATRIX_SIZE="${MATRIX_SIZE:-8192}"
-WARMUP="${WARMUP:-20}"
-ITERATIONS="${ITERATIONS:-200}"
-GPU_COUNT="${GPU_COUNT:-8}"
-FIRST_GPU="${FIRST_GPU:-0}"
-WORKSPACE_MB="${WORKSPACE_MB:-256}"
-
-if [[ ! -x "$NVCC" ]]; then
-  echo "nvcc not found: $NVCC" >&2
-  exit 1
-fi
-
-mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
-HOST="$(hostname 2>/dev/null || echo unknown)"
-TS="$(date +%Y%m%d_%H%M%S)"
-BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
-REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
-
-"$NVCC" -O3 -std=c++17 -arch=sm_90 \
-  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
-  -lcublasLt -lcublas -o "$BIN"
-
-set +e
-"$BIN" \
-  --matrix-size "$MATRIX_SIZE" \
-  --warmup "$WARMUP" \
-  --iterations "$ITERATIONS" \
-  --first-gpu "$FIRST_GPU" \
-  --gpu-count "$GPU_COUNT" \
-  --workspace-mb "$WORKSPACE_MB" \
-  | tee "$REPORT"
-status=${PIPESTATUS[0]}
-set -e
-
-echo "Report written to: $REPORT"
-exit "$status"
--- a/scripts/run_fp8_path_comparison.sh
+++ b/scripts/run_fp8_path_comparison.sh
@ -1,93 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
-PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
-
-PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
-CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
-NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
-OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
-MATRIX_SIZE="${MATRIX_SIZE:-8192}"
-WARMUP="${WARMUP:-20}"
-ITERATIONS="${ITERATIONS:-100}"
-GPU_INDEX="${GPU_INDEX:-0}"
-WORKSPACE_MB="${WORKSPACE_MB:-256}"
-VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
-import site
-print(site.getsitepackages()[0])
-PY
-)"
-export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"
-
-mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
-
-HOST="$(hostname 2>/dev/null || echo unknown)"
-TS="$(date +%Y%m%d_%H%M%S)"
-PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
-CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
-COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
-
-"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
-  --matrix-size "$MATRIX_SIZE" \
-  --warmup "$WARMUP" \
-  --iterations "$ITERATIONS" \
-  --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
-
-"$NVCC" -O3 -std=c++17 -arch=sm_90 \
-  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
-  -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
-
-"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
-  --matrix-size "$MATRIX_SIZE" \
-  --warmup "$WARMUP" \
-  --iterations "$ITERATIONS" \
-  --first-gpu "$GPU_INDEX" \
-  --gpu-count 1 \
-  --workspace-mb "$WORKSPACE_MB" \
-  --fast-accum 1 | tee "$CUBLAS_REPORT"
-
-"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
-import json
-import pathlib
-import sys
-
-py_report = pathlib.Path(sys.argv[1])
-cublas_report = pathlib.Path(sys.argv[2])
-combined_report = pathlib.Path(sys.argv[3])
-
-with py_report.open() as f:
-    py_payload = json.load(f)
-with cublas_report.open() as f:
-    cublas_payload = json.load(f)
-
-combined = {
-    "source": "fp8_path_comparison",
-    "host": cublas_payload.get("host"),
-    "matrix_size": py_payload.get("matrix_size"),
-    "gpu_index": py_payload.get("gpu_index"),
-    "pytorch": py_payload,
-    "cublaslt": cublas_payload,
-    "results": [],
-}
-combined["results"].extend(py_payload.get("results", []))
-per_gpu = cublas_payload.get("per_gpu", [])
-if per_gpu:
-    row = dict(per_gpu[0])
-    row.update({
-        "name": "E_direct_cublaslt_fast_accum",
-        "status": "ok",
-        "tflops": row.pop("fp8_tflops"),
-        "matrix_size": cublas_payload.get("matrix_size"),
-        "iterations": cublas_payload.get("iterations"),
-        "warmup": cublas_payload.get("warmup"),
-        "fast_accum": cublas_payload.get("fast_accum"),
-        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
-    })
-    combined["results"].append(row)
-
-combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
-print(f"Combined report written to: {combined_report}")
-PY
-
-echo "$COMBINED_REPORT"
--- a/doc/调研.md
+++ b/doc/调研.md
Author	SHA1	Message	Date
cs	211140e7f1	Add H100 acceptance PR summary	2026-05-23 20:37:19 +08:00
cs	9b0e6e29df	Add H100 acceptance delivery manifest	2026-05-23 20:34:01 +08:00
cs	cfca4df62a	Add H100 network escalation request	2026-05-23 20:29:19 +08:00
cs	6d72c67468	Add H100 acceptance closure checklist	2026-05-23 20:25:39 +08:00
cs	64f203a9be	Document H100 acceptance entrypoint	2026-05-23 20:22:15 +08:00
cs	c48f9f915d	Summarize current H100 acceptance status	2026-05-23 20:15:01 +08:00
cs	0a772103f0	Archive all-collectives NCCL artifacts	2026-05-23 20:11:22 +08:00
cs	f5699bf85a	Add multinode NCCL all collectives run	2026-05-23 20:07:47 +08:00
cs	59621d26a0	Analyze multinode NCCL artifact signals	2026-05-23 19:50:51 +08:00
cs	18cebd8e06	Record multinode NCCL artifacts run	2026-05-23 19:45:03 +08:00
cs	1a8cf6cbbb	Archive multinode NCCL raw artifacts	2026-05-23 19:36:53 +08:00
cs	63c32fd75d	Clarify multinode NCCL report thresholds	2026-05-23 19:33:01 +08:00
cs	4f38b3a2a0	Record multinode NCCL PDF matrix run	2026-05-23 19:30:14 +08:00
cs	8ff5021385	Add multinode NCCL PDF matrix runner	2026-05-23 19:21:58 +08:00
cs	bdffd7e616	Add single-node H100 all runner	2026-05-23 19:16:40 +08:00
cs	b6b1ccc2dc	Add NCCL environment snapshot script	2026-05-23 19:13:35 +08:00
cs	ec6b868d3f	Add NCCL latest report index	2026-05-23 18:59:45 +08:00
cs	3a0e739991	Add NCCL network handoff plan	2026-05-23 18:57:22 +08:00
cs	b914cb7b4b	Document NCCL environment equivalence gaps	2026-05-23 18:54:35 +08:00
cs	adcdb36e05	Document NCCL deep diagnosis rerun	2026-05-23 18:51:41 +08:00
cs	0d63ea5e05	Add multinode NCCL deep diagnosis tools	2026-05-23 18:49:52 +08:00
cs	08e0d93a16	Document NCCL graph comparison	2026-05-23 17:32:03 +08:00
cs	9a32645c9d	Document NCCL alltoall secondary sweep	2026-05-23 17:28:28 +08:00
cs	71ac97a24e	Compare NCCL allreduce alltoall counters	2026-05-23 17:17:22 +08:00
cs	890e623be4	Document NCCL alltoall counter probe	2026-05-23 17:13:03 +08:00
cs	17441a4583	Document PXN alltoall rail balancing	2026-05-23 17:03:02 +08:00
cs	05294a66d8	Tune multinode alltoall PXN behavior	2026-05-23 17:00:03 +08:00
cs	6b302c8b09	Add raw RDMA rail bandwidth evidence	2026-05-23 16:46:15 +08:00
cs	cd1728a7c3	Document missing NCCL network plugin	2026-05-23 16:43:25 +08:00
cs	48a27472e2	Document NCCL rail saturation evidence	2026-05-23 16:42:27 +08:00
cs	49d358c0ca	Add NCCL PDF matrix topology report	2026-05-23 16:35:24 +08:00
cs	519de86553	Tune multinode NCCL auto parameters	2026-05-23 16:12:32 +08:00
cs	3268f4bd2a	Validate NCCL 2.27 multinode GDR performance	2026-05-23 15:58:21 +08:00
cs	b82ac54218	Stabilize multinode NCCL launch diagnostics	2026-05-23 15:49:14 +08:00
cs	28fe55d5c7	Add multinode NCCL diagnostic report	2026-05-23 15:39:15 +08:00
cs	ac91f1aeb5	Add multi-node NCCL sweep test	2026-05-23 13:03:26 +08:00
cs	2a51be1ba3	Add H100 acceptance test coverage and reports	2026-05-23 10:41:09 +08:00