Compare commits

..

37 Commits

Author SHA1 Message Date
cs
211140e7f1 Add H100 acceptance PR summary 2026-05-23 20:37:19 +08:00
cs
9b0e6e29df Add H100 acceptance delivery manifest 2026-05-23 20:34:01 +08:00
cs
cfca4df62a Add H100 network escalation request 2026-05-23 20:29:19 +08:00
cs
6d72c67468 Add H100 acceptance closure checklist 2026-05-23 20:25:39 +08:00
cs
64f203a9be Document H100 acceptance entrypoint 2026-05-23 20:22:15 +08:00
cs
c48f9f915d Summarize current H100 acceptance status 2026-05-23 20:15:01 +08:00
cs
0a772103f0 Archive all-collectives NCCL artifacts 2026-05-23 20:11:22 +08:00
cs
f5699bf85a Add multinode NCCL all collectives run 2026-05-23 20:07:47 +08:00
cs
59621d26a0 Analyze multinode NCCL artifact signals 2026-05-23 19:50:51 +08:00
cs
18cebd8e06 Record multinode NCCL artifacts run 2026-05-23 19:45:03 +08:00
cs
1a8cf6cbbb Archive multinode NCCL raw artifacts 2026-05-23 19:36:53 +08:00
cs
63c32fd75d Clarify multinode NCCL report thresholds 2026-05-23 19:33:01 +08:00
cs
4f38b3a2a0 Record multinode NCCL PDF matrix run 2026-05-23 19:30:14 +08:00
cs
8ff5021385 Add multinode NCCL PDF matrix runner 2026-05-23 19:21:58 +08:00
cs
bdffd7e616 Add single-node H100 all runner 2026-05-23 19:16:40 +08:00
cs
b6b1ccc2dc Add NCCL environment snapshot script 2026-05-23 19:13:35 +08:00
cs
ec6b868d3f Add NCCL latest report index 2026-05-23 18:59:45 +08:00
cs
3a0e739991 Add NCCL network handoff plan 2026-05-23 18:57:22 +08:00
cs
b914cb7b4b Document NCCL environment equivalence gaps 2026-05-23 18:54:35 +08:00
cs
adcdb36e05 Document NCCL deep diagnosis rerun 2026-05-23 18:51:41 +08:00
cs
0d63ea5e05 Add multinode NCCL deep diagnosis tools 2026-05-23 18:49:52 +08:00
cs
08e0d93a16 Document NCCL graph comparison 2026-05-23 17:32:03 +08:00
cs
9a32645c9d Document NCCL alltoall secondary sweep 2026-05-23 17:28:28 +08:00
cs
71ac97a24e Compare NCCL allreduce alltoall counters 2026-05-23 17:17:22 +08:00
cs
890e623be4 Document NCCL alltoall counter probe 2026-05-23 17:13:03 +08:00
cs
17441a4583 Document PXN alltoall rail balancing 2026-05-23 17:03:02 +08:00
cs
05294a66d8 Tune multinode alltoall PXN behavior 2026-05-23 17:00:03 +08:00
cs
6b302c8b09 Add raw RDMA rail bandwidth evidence 2026-05-23 16:46:15 +08:00
cs
cd1728a7c3 Document missing NCCL network plugin 2026-05-23 16:43:25 +08:00
cs
48a27472e2 Document NCCL rail saturation evidence 2026-05-23 16:42:27 +08:00
cs
49d358c0ca Add NCCL PDF matrix topology report 2026-05-23 16:35:24 +08:00
cs
519de86553 Tune multinode NCCL auto parameters 2026-05-23 16:12:32 +08:00
cs
3268f4bd2a Validate NCCL 2.27 multinode GDR performance 2026-05-23 15:58:21 +08:00
cs
b82ac54218 Stabilize multinode NCCL launch diagnostics 2026-05-23 15:49:14 +08:00
cs
28fe55d5c7 Add multinode NCCL diagnostic report 2026-05-23 15:39:15 +08:00
cs
ac91f1aeb5 Add multi-node NCCL sweep test 2026-05-23 13:03:26 +08:00
cs
2a51be1ba3 Add H100 acceptance test coverage and reports 2026-05-23 10:41:09 +08:00
20 changed files with 557 additions and 2517 deletions

View File

@ -1,4 +1,4 @@
# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
gpu_type: auto
benchmark:
@ -14,25 +14,12 @@ benchmark:
- fp16
- bf16
- fp8
# MAMF-style shape sweep: measure each dtype at every shape below and keep the max
# TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
# ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
# Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
# (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
# Empty list => single matrix_size shape (legacy behaviour).
sweep_sizes:
- 3584
- 4608
- 5376
- 8192
- 11520
- [2048, 2048, 13312]
- [2048, 2048, 16384]
matrix_size: 8192 # fallback shape when sweep_sizes is empty
warmup: 20
iterations: 80
# NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
# GEMM, and compiling would re-autotune per shape and make the sweep very slow.
- fp64
- int8
matrix_size: 8192
warmup: 50
iterations: 500
use_compile: true
health:
temp_warning: 75
@ -44,9 +31,15 @@ nccl:
test_allreduce: true
test_alltoall: true
test_broadcast: true
test_reduce_scatter: false
test_allgather: false
test_sendrecv: false
test_reduce_scatter: true
test_allgather: true
test_sendrecv: true
message_sizes:
- 1M
- 256M
- 2G
repeats: 3
max_stddev_pct: 3
multinode_nccl:
enabled: false
@ -98,44 +91,66 @@ multinode_nccl:
alltoall: 75
stress:
duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
duration_sec: 1800
production_duration_sec: 1800
use_gpu_burn: false
use_doubles: false
use_tensor_cores: true
memory_pct: 90
gpus: all
dtype: bf16
matrix_size: 24576
telemetry_interval_sec: 1
warmup_sec: 60
min_steady_samples: 10
max_temp_c: 80
max_temp_delta_c: 5
min_power_watts: 630
max_tflops_jitter_pct: 5
require_tflops_jitter: true
rdma:
min_bandwidth_gbps: 50
max_latency_us: 10
min_bandwidth_gbps: 47
min_port_rate_gbps: 400
max_latency_us: 3.5
max_write_latency_us: 2.0
max_read_latency_us: 3.5
ib_iterations: 1000
msg_size: 65536
msg_size: 4194304
latency_msg_size: 8
ib_device: null
ib_port: 1
# Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
# node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
# ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
# then drives the local client per device.
cross_node:
enabled: false # set true on the client node to run cross-node RDMA
server: null # peer ssh address, e.g. 172.72.8.12 (server node)
server_addr: null # OOB addr client connects to (default: = server)
ssh_user: root
devices: [] # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
ib_port: 1
gid_index: null # -x <n> for RoCE; null for pure InfiniBand
msg_size: 1048576 # 1 MiB — large enough to reach NDR400 peak
iters: 5000
base_oob_port: 18515 # per-device OOB port = base + device index
server_warmup_sec: 2.0
min_bandwidth_gbps: 350 # per-port PASS floor (NDR400 ≈ 0.9 × 400)
max_latency_us: 5
server_addr: null
ibping_target: null
ibping_count: 5
role: auto
pfc_ecn_counters: true
nvlink:
expected_links_per_gpu: 18
expected_link_speed_gbps: 25
require_zero_errors: true
dcgm:
diag_level: 3
timeout_sec: 3600
expected_num_gpus: 8
json_output: true
require_subtests: true
training:
model: gpt2
model: synthetic_1.5b
batch_size: 8
seq_length: 2048
num_steps: 50
warmup_steps: 5
dtype: bf16
mode: ddp
synthetic_params_b: 1.5
min_tokens_per_sec: 45000
max_step_jitter_pct: 3
max_peak_memory_gb: 70
require_distributed: true
report:
output_dir: ./reports

View File

@ -312,31 +312,10 @@ class Benchmark:
def run_compute_benchmark(self, dtypes: Optional[List[str]] = None) -> dict:
comp_cfg = self.bench_cfg.get("compute", {})
configured_dtypes = dtypes or comp_cfg.get("dtypes", ["fp32", "tf32", "fp16", "bf16", "fp8"])
# MAMF-style shape sweep (à la stas00's mamf-finder): a single fixed matmul
# shape under-reports the achievable peak by ~7-12% and therefore can't meet
# the MAMF-calibrated PASS thresholds in gpu_specs.compute_pass_thresholds_tflops.
# So for each dtype we time several matmul shapes and keep the MAXIMUM TFLOPS
# (the realistic peak). matrix_size is the fallback when sweep_sizes is empty.
matrix_size = comp_cfg.get("matrix_size", 8192)
sweep_sizes = comp_cfg.get("sweep_sizes") or [matrix_size]
warmup = comp_cfg.get("warmup", 20)
iterations = comp_cfg.get("iterations", 80)
# Each sweep entry is either an int N (square N×N×N) or an [M, N, K] triple.
# Non-square / K-heavy shapes (e.g. 2048×2048×13312) reach the true MAMF peak
# on Hopper — square-only tops out ~5% lower — so the default set mixes both.
def _to_shape(entry):
if isinstance(entry, (list, tuple)):
if len(entry) == 3:
return tuple(int(x) for x in entry)
if len(entry) == 1:
n = int(entry[0])
return (n, n, n)
raise ValueError(f"sweep size {entry!r} must be an int or [M, N, K]")
n = int(entry)
return (n, n, n)
shapes = [_to_shape(e) for e in sweep_sizes]
matrix_size = comp_cfg.get("matrix_size", 4096)
warmup = comp_cfg.get("warmup", 10)
iterations = comp_cfg.get("iterations", 100)
use_compile = comp_cfg.get("use_compile", False)
if not TORCH_AVAILABLE:
self.console.print("[yellow]PyTorch not available - skipping compute benchmark[/yellow]")
@ -344,22 +323,37 @@ class Benchmark:
gpu_count = torch.cuda.device_count()
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
if len(sweep_sizes) > 1:
self.console.print(
f"[cyan] MAMF shape sweep over {len(sweep_sizes)} sizes: "
f"{', '.join(str(s) for s in sweep_sizes)}[/cyan]"
)
# torch.compile(max-autotune) benchmarks cuBLAS vs Triton kernels and picks
# the fastest for this GPU/shape, typically improving efficiency by 8-15%.
# compile_warmup must be larger than warmup to absorb JIT + autotuning time.
mm_fn = torch.matmul
compile_warmup = warmup
if use_compile:
try:
_compiled = torch.compile(torch.matmul, mode="max-autotune")
# Trial call to trigger JIT and verify compilation succeeds before the dtype loop.
_t = torch.randn(64, 64, device="cuda", dtype=torch.float32)
_compiled(_t, _t)
torch.cuda.synchronize()
del _t
mm_fn = _compiled
compile_warmup = max(warmup, 50)
self.console.print("[cyan] torch.compile(max-autotune) enabled[/cyan]")
except Exception as e:
self.console.print(f"[yellow] torch.compile unavailable ({type(e).__name__}), using eager[/yellow]")
dtype_map = {
"fp32": (torch.float32, self.specs["fp32_tflops"]),
"tf32": ("tf32", self.specs["tf32_tflops"]),
"fp16": (torch.float16, self.specs["fp16_tflops"]),
"bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
"fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
"fp32": (torch.float32, self.specs.get("fp32_tflops", 0)),
"tf32": ("tf32", self.specs.get("tf32_tflops", 0)),
"fp16": (torch.float16, self.specs.get("fp16_tflops", 0)),
"bf16": (torch.bfloat16, self.specs.get("bf16_tflops", 0)),
"fp8": (getattr(torch, "float8_e4m3fn", None), self.specs.get("fp8_tflops", 0)),
"fp64": (torch.float64, self.specs.get("fp64_tflops", 0)),
"int8": (torch.int8, self.specs.get("int8_tflops", 0)),
}
results_by_dtype = {}
best_shapes = {}
per_gpu_results = [{"index": i} for i in range(gpu_count)]
with Progress(
@ -382,41 +376,27 @@ class Benchmark:
progress.advance(task)
continue
dtype_val, peak_tflops = dtype_map[dtype_name]
# allow_tf32 only affects float32 matmuls: ON for the TF32 run, OFF for
# the true-FP32 run so the two stay distinct.
old_tf32 = torch.backends.cuda.matmul.allow_tf32
if dtype_name == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
dtype_val = torch.float32
elif dtype_name == "fp32":
torch.backends.cuda.matmul.allow_tf32 = False
best_tflops, best_shape, last_err = 0.0, None, None
for (M, N, K) in shapes:
gpu_values = []
errors = []
for gpu_idx in range(gpu_count):
try:
t = self._bench_matmul_once(dtype_name, dtype_val, M, N, K, warmup, iterations)
if t > best_tflops:
best_tflops, best_shape = t, (M, N, K)
except Exception as e: # noqa: BLE001 - record and try the next shape
last_err = e
torch.backends.cuda.matmul.allow_tf32 = old_tf32
if best_shape is None:
results_by_dtype[dtype_name] = f"error: {last_err}"
self.console.print(f"[yellow] {dtype_name}: {last_err}[/yellow]")
else:
shape_str = "x".join(str(d) for d in best_shape)
results_by_dtype[dtype_name] = round(best_tflops, 1)
best_shapes[dtype_name] = shape_str
for pg in per_gpu_results:
pg[dtype_name] = round(best_tflops, 1)
if len(shapes) > 1:
self.console.print(
f"[dim] {dtype_name}: {best_tflops:.1f} TFLOPS @ {shape_str}[/dim]"
val = self._benchmark_dtype_on_gpu(
dtype_name, dtype_map[dtype_name][0], matrix_size,
warmup, compile_warmup, iterations, mm_fn, gpu_idx,
)
gpu_values.append(val)
per_gpu_results[gpu_idx][dtype_name] = round(val, 1)
except Exception as e:
errors.append(f"gpu{gpu_idx}: {e}")
per_gpu_results[gpu_idx][dtype_name] = f"error: {e}"
finally:
torch.cuda.empty_cache()
if gpu_values:
results_by_dtype[dtype_name] = round(sum(gpu_values) / len(gpu_values), 1)
else:
results_by_dtype[dtype_name] = "error: " + "; ".join(errors[:3])
self.console.print(f"[yellow] {dtype_name}: {results_by_dtype[dtype_name]}[/yellow]")
progress.advance(task)
@ -427,78 +407,119 @@ class Benchmark:
if peak_tp:
efficiency[dt] = round((achieved / peak_tp) * 100, 1)
consistency = {}
for dt in results_by_dtype:
vals = [pg.get(dt) for pg in per_gpu_results]
nums = [v for v in vals if isinstance(v, (int, float))]
if len(nums) >= 2:
mean = sum(nums) / len(nums)
spread_pct = ((max(nums) - min(nums)) / mean * 100) if mean else 0
consistency[dt] = {
"mean_tflops": round(mean, 1),
"min_tflops": round(min(nums), 1),
"max_tflops": round(max(nums), 1),
"spread_pct": round(spread_pct, 2),
"max_allowed_pct": 3,
"passed": spread_pct <= 3,
}
pass_thresholds = dict(self.specs.get("compute_pass_thresholds_tflops") or {})
threshold_passed = True
for dt, threshold in pass_thresholds.items():
val = results_by_dtype.get(dt)
if not isinstance(val, (int, float)) or val < threshold:
threshold_passed = False
break
consistency_passed = all(row.get("passed", False) for row in consistency.values()) if consistency else True
return {
"compute": {
"passed": threshold_passed and consistency_passed,
"per_dtype_tflops": results_by_dtype,
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
"efficiency_pct": efficiency,
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
# report.py judges PASS/WARN/FAIL against these directly instead of
# using % of peak. Empty dict => fall back to legacy 80% rule.
"pass_thresholds_tflops": dict(
self.specs.get("compute_pass_thresholds_tflops") or {}
),
"pass_thresholds_tflops": pass_thresholds,
"per_gpu": per_gpu_results,
"sweep_sizes": list(sweep_sizes),
"best_shapes": best_shapes,
"consistency": consistency,
"matrix_size": matrix_size,
"warmup": warmup,
"iterations": iterations,
}
}
def _bench_matmul_once(self, dtype_name: str, dtype_val, M: int, N: int, K: int,
warmup: int, iterations: int) -> float:
"""Time one (M×K)·(K×N) matmul for a dtype and return achieved TFLOPS.
Uses an L2-cache-busting pool of matrix pairs (total > 256 MB) so operands
can't be served from L2 across iterations, and CUDA events for timing. FP8
goes through torch._scaled_mm (e4m3); all others through torch.matmul eager
cuBLAS, which on H100 beats torch.compile/Triton for plain GEMM and avoids the
per-shape recompile cost that would make a sweep pathologically slow.
"""
elem_bytes = 1 if dtype_name == "fp8" else torch.tensor([], dtype=dtype_val).element_size()
pair_bytes = (M * K + K * N) * elem_bytes
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes)) # ceil(256MB / pair)
if dtype_name == "fp8":
if not hasattr(torch, "_scaled_mm"):
raise RuntimeError("torch._scaled_mm unavailable — upgrade to PyTorch >= 2.1")
pools_a = [torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
pools_b = [torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
scale_a = torch.tensor(1.0, device="cuda")
scale_b = torch.tensor(1.0, device="cuda")
def op(i):
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
else:
pools_a = [torch.randn(M, K, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
pools_b = [torch.randn(K, N, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
def op(i):
return torch.matmul(pools_a[i], pools_b[i])
def _benchmark_dtype_on_gpu(self, dtype_name: str, dtype_val, matrix_size: int,
warmup: int, compile_warmup: int, iterations: int,
mm_fn, gpu_idx: int) -> float:
if dtype_name == "fp8" and dtype_val is None:
raise RuntimeError("torch.float8_e4m3fn unavailable")
device = f"cuda:{gpu_idx}"
old_tf32 = torch.backends.cuda.matmul.allow_tf32
try:
# Probe once so a broken/unsupported kernel raises before the timed loop.
_probe = op(0)
torch.cuda.synchronize()
del _probe
with torch.cuda.device(gpu_idx):
if dtype_name == "tf32":
torch.backends.cuda.matmul.allow_tf32 = True
dtype_val = torch.float32
for i in range(warmup):
op(i % num_pools)
torch.cuda.synchronize()
M = N = K = matrix_size
if dtype_name == "int8" and M > 4096:
# torch._int_mm on 8192 can be extremely memory hungry because the
# output is int32. Keep it production-visible, but bounded.
M = N = K = 4096
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(iterations):
op(i % num_pools)
end_event.record()
torch.cuda.synchronize()
elapsed_ms = start_event.elapsed_time(end_event)
elem_bytes = 1 if dtype_name in ("fp8", "int8") else torch.tensor([], dtype=dtype_val).element_size()
pair_bytes = 2 * M * K * elem_bytes
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes))
if dtype_name == "fp8":
if not hasattr(torch, "_scaled_mm"):
raise RuntimeError("torch._scaled_mm unavailable")
pools_a = [torch.randn(M, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
pools_b = [torch.randn(N, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
scale_a = torch.tensor(1.0, device=device)
scale_b = torch.tensor(1.0, device=device)
def run(i):
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
effective_warmup = warmup
elif dtype_name == "int8":
if not hasattr(torch, "_int_mm"):
raise RuntimeError("torch._int_mm unavailable")
pools_a = [torch.randint(-128, 127, (M, K), device=device, dtype=torch.int8) for _ in range(num_pools)]
pools_b = [torch.randint(-128, 127, (K, N), device=device, dtype=torch.int8) for _ in range(num_pools)]
def run(i):
return torch._int_mm(pools_a[i], pools_b[i])
effective_warmup = warmup
else:
pools_a = [torch.randn(M, K, device=device, dtype=dtype_val) for _ in range(num_pools)]
pools_b = [torch.randn(K, N, device=device, dtype=dtype_val) for _ in range(num_pools)]
def run(i):
return mm_fn(pools_a[i], pools_b[i])
effective_warmup = compile_warmup
for i in range(effective_warmup):
run(i % num_pools)
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(iterations):
c = run(i % num_pools)
end_event.record()
torch.cuda.synchronize()
elapsed_ms = start_event.elapsed_time(end_event)
del pools_a, pools_b, c
flops = 2 * M * N * K * iterations
return flops / (elapsed_ms / 1000) / 1e12
finally:
del pools_a, pools_b
torch.cuda.empty_cache()
return (2 * M * N * K * iterations) / (elapsed_ms / 1000) / 1e12
torch.backends.cuda.matmul.allow_tf32 = old_tf32
@staticmethod
def print_results(results: dict, console: Console = None):
@ -583,77 +604,24 @@ class Benchmark:
f"[{ec}]{ef:.1f}%[/{ec}]")
c.print(table)
@staticmethod
def judge_compute(results: dict) -> dict:
"""Judge compute results against pass_thresholds_tflops.
Single source of truth for the PASS/WARN/FAIL rule (same one report.py uses):
achieved >= thr -> PASS; >= 0.9*thr -> WARN; else FAIL. A string achieved value
(skipped/error) -> SKIP. A dtype without a threshold falls back to efficiency
(>=80 PASS / >=50 WARN / else FAIL).
Returns {"rows": [(dtype, achieved, threshold, status), ...], "verdict": str}.
"""
comp = results.get("compute", results)
per_dtype = comp.get("per_dtype_tflops", {})
thresholds = comp.get("pass_thresholds_tflops", {}) or {}
eff = comp.get("efficiency_pct", {})
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
rows, verdict = [], "PASS"
for dt, val in per_dtype.items():
thr = thresholds.get(dt)
if isinstance(val, str):
status = "SKIP"
elif thr:
status = "PASS" if val >= thr else ("WARN" if val >= thr * 0.9 else "FAIL")
else:
e = eff.get(dt, 0)
status = "PASS" if e >= 80 else ("WARN" if e >= 50 else "FAIL")
rows.append((dt, val, thr, status))
if rank[status] > rank[verdict]:
verdict = status
return {"rows": rows, "verdict": verdict}
@staticmethod
def print_compute_verdict(results: dict, console: Console = None) -> str:
"""Print the PASS/WARN/FAIL table for compute results; return the verdict."""
c = console or Console()
judged = Benchmark.judge_compute(results)
color = {"PASS": "green", "WARN": "yellow", "FAIL": "red", "SKIP": "dim"}
c.print("\n[bold cyan]Compute Verdict (vs thresholds)[/bold cyan]")
for dt, val, thr, status in judged["rows"]:
val_s = f"{val:.1f}" if isinstance(val, (int, float)) else str(val)
thr_s = f">= {thr}" if thr else "(efficiency)"
c.print(f" {dt.upper():>4}: {val_s:>8} {thr_s:<12} [{color[status]}]{status}[/{color[status]}]")
v = judged["verdict"]
c.print(f" [bold]VERDICT: [{color[v]}]{v}[/{color[v]}][/bold]")
return v
def _run_cli() -> None:
"""`python -m modules.benchmark` — run ONLY the compute-throughput benchmark."""
import argparse
from pathlib import Path
import yaml
repo_root = Path(__file__).resolve().parent.parent
parser = argparse.ArgumentParser(description="Run the compute-throughput benchmark only.")
parser.add_argument("--config", default=str(repo_root / "configs" / "default.yaml"),
help="path to config YAML (default: configs/default.yaml)")
parser.add_argument("--json", action="store_true", help="also print raw JSON of the compute results")
args = parser.parse_args()
with open(args.config) as f:
config = yaml.safe_load(f) or {}
results = Benchmark(config).run_compute_benchmark()
Benchmark.print_results(results)
Benchmark.print_compute_verdict(results)
if args.json:
print("JSON_RESULT:" + json.dumps(results["compute"]))
if __name__ == "__main__":
_run_cli()
consistency = comp.get("consistency", {})
if consistency:
t_cons = Table(title="Per-GPU Consistency", box=None, padding=(0, 1))
t_cons.add_column("DType", style="bold")
t_cons.add_column("Min", justify="right")
t_cons.add_column("Mean", justify="right")
t_cons.add_column("Max", justify="right")
t_cons.add_column("Spread", justify="right")
t_cons.add_column("Status", justify="right")
for dt, row in consistency.items():
status = "PASS" if row.get("passed") else "FAIL"
color = "green" if row.get("passed") else "red"
t_cons.add_row(
dt.upper(),
f"{row.get('min_tflops', 0):.1f}",
f"{row.get('mean_tflops', 0):.1f}",
f"{row.get('max_tflops', 0):.1f}",
f"{row.get('spread_pct', 0):.2f}%",
f"[{color}]{status}[/{color}]",
)
c.print(t_cons)

View File

@ -11,7 +11,6 @@ GPU_NAME_PATTERNS = {
"A100": "a100",
"A800": "a800",
"H100": "h100",
"H800": "h800", # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
"H200": "h200",
"H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
"B200": "b200",
@ -36,17 +35,11 @@ GPU_SPECS = {
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
"bf16_tflops": 990, # dense
"fp8_tflops": 1979, # dense
"fp64_tflops": 67,
"int8_tflops": 1979,
"compute_pass_thresholds_tflops": {
# Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
# threshold ~2-4% below the sustained value measured across 16 GPUs via the
# MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
# fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
# fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
# on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
# an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
# doesn't currently exercise these dtypes.
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
"fp64": 63, "int8": 1536,
},
"tdp_watts": 700,
"nvlink_gen": 4,
@ -67,51 +60,10 @@ GPU_SPECS = {
"fp16_tflops": 990, # dense
"bf16_tflops": 990, # dense
"fp8_tflops": 1979, # dense
# PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
# calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
# achievable TFLOPS in PyTorch is in the same band.
"compute_pass_thresholds_tflops": {
"fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
},
"tdp_watts": 700,
"nvlink_gen": 4,
"nvlink_bandwidth_gbps": 900,
"pcie_gen": 5,
"min_driver_version": "545",
"min_cuda_version": "12.4",
},
"h800": {
# H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
# clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
# FP32) are identical to H100. Two restrictions vs H100:
# 1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
# 2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
# All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
# NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
"full_name": "NVIDIA H800 SXM5",
"architecture": "Hopper",
"compute_capability": 9.0,
"hbm_capacity_gb": 80,
"hbm_type": "HBM3",
"memory_bandwidth_gbps": 3350, # GB/s (3.35 TB/s) — same as H100 SXM
"fp32_tflops": 67,
"tf32_tflops": 495, # dense (same as H100)
"fp16_tflops": 990, # dense (same as H100)
"bf16_tflops": 990, # dense (same as H100)
"fp8_tflops": 1979, # dense (same as H100)
# Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
# eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
# tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
# 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
# can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
# h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
"compute_pass_thresholds_tflops": {
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
},
"tdp_watts": 700,
"nvlink_gen": 4,
"nvlink_bandwidth_gbps": 400, # bidirectional — HALF of H100 (export restriction)
"pcie_gen": 5,
"min_driver_version": "535",
"min_cuda_version": "12.1",
},

View File

@ -1,15 +1,17 @@
"""RDMA / InfiniBand bandwidth and latency test module."""
import glob
import os
import shutil
import subprocess
import time
from datetime import datetime
from typing import Optional, List
from rich.console import Console
from rich.table import Table
from modules.gpu_specs import resolve_tools_dir
class RDMATest:
@ -17,11 +19,24 @@ class RDMATest:
self.config = config
self.console = Console()
self.rdma_cfg = config.get("rdma", {})
self.tools_dir = resolve_tools_dir(config)
def _find_tool(self, name: str) -> Optional[str]:
p = shutil.which(name)
if p:
return p
candidates = [
os.path.join(self.tools_dir, "perftest", name),
os.path.join(self.tools_dir, "perftest", "bin", name),
os.path.join(self.tools_dir, "rdma", name),
os.path.join(self.tools_dir, name),
]
for path in candidates:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
for path in glob.glob(os.path.join(self.tools_dir, "**", name), recursive=True):
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
return None
def _get_ib_devices(self) -> List[str]:
@ -101,26 +116,40 @@ class RDMATest:
self.console.print(f"[cyan]RDMA Test - Devices: {', '.join(devices)}[/cyan]")
bw_results = self._run_bandwidth_tests(devices)
latency_results = self._run_latency_tests(devices)
active_pairs = [
(dev, port) for dev, port in ib_devices
if "ACTIVE" in self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state").upper()
]
port_checks = self._evaluate_port_checks(device_info)
test_devices = [dev for dev, _ in active_pairs]
bw_results = self._run_bandwidth_tests(test_devices)
latency_results = self._run_latency_tests(test_devices)
ibping_results = self._run_ibping_tests(active_pairs)
fabric_counters = self._collect_pfc_ecn_counters() if self.rdma_cfg.get("pfc_ecn_counters", True) else {}
failures = self._failure_reasons(port_checks, bw_results, latency_results, ibping_results, fabric_counters)
fabric_counters_missing = (
self.rdma_cfg.get("pfc_ecn_counters", True)
and fabric_counters
and not fabric_counters.get("counters")
)
all_passed = all(
r.get("status") == "PASS"
for r in bw_results + latency_results
for r in bw_results + latency_results + ibping_results
if isinstance(r, dict)
)
) and all(p.get("status") == "PASS" for p in port_checks) and not fabric_counters.get("failed", False) and not fabric_counters_missing
result = {
return {
"passed": all_passed,
"devices": device_info,
"port_checks": port_checks,
"bandwidth_tests": bw_results,
"latency_tests": latency_results,
"ibping_tests": ibping_results,
"fabric_counters": fabric_counters,
"failures": failures,
"timestamp": datetime.now().isoformat(),
}
# Cross-node (two-host) RDMA, run only when a peer is configured.
if (self.rdma_cfg.get("cross_node", {}) or {}).get("enabled"):
result["cross_node"] = self.run_cross_node()
return result
def _collect_device_info(self, devices: List[str]) -> List[dict]:
info = []
@ -141,11 +170,83 @@ class RDMATest:
port_info[label] = f.read().strip()
except (FileNotFoundError, PermissionError):
port_info[label] = "N/A"
port_info["link_layer"] = self._read_sys(
f"/sys/class/infiniband/{dev}/ports/{port}/link_layer"
) or "N/A"
dev_info["ports"].append(port_info)
info.append(dev_info)
return info
def _evaluate_port_checks(self, device_info: List[dict]) -> List[dict]:
checks = []
min_rate = float(self.rdma_cfg.get("min_port_rate_gbps", 400))
for dev in device_info:
for port in dev.get("ports", []):
if port.get("link_layer") != "InfiniBand":
continue
state = port.get("state", "")
rate = port.get("rate", "")
rate_gbps = self._parse_rate_gbps(rate)
status = "PASS" if "ACTIVE" in state.upper() and rate_gbps >= min_rate else "FAIL"
checks.append({
"device": dev.get("name"),
"port": port.get("port"),
"state": state,
"rate": rate,
"rate_gbps": rate_gbps,
"min_rate_gbps": min_rate,
"status": status,
})
return checks
@staticmethod
def _parse_rate_gbps(rate: str) -> float:
# Example: "400 Gb/sec (4X NDR)"
try:
return float(str(rate).split()[0])
except (ValueError, IndexError, AttributeError):
return 0.0
@staticmethod
def _failure_reasons(port_checks: List[dict], bw_results: List[dict],
latency_results: List[dict], ibping_results: List[dict],
fabric_counters: dict) -> List[str]:
failures = []
for p in port_checks:
if p.get("status") != "PASS":
failures.append(
f"{p.get('device')} port {p.get('port')} state/rate failed "
f"({p.get('state')}, {p.get('rate')}; required >= {p.get('min_rate_gbps')}Gbps ACTIVE)"
)
for r in bw_results:
if r.get("status") != "PASS":
if r.get("error"):
failures.append(f"{r.get('test')} failed: {r.get('error')}")
else:
failures.append(
f"{r.get('test')} bandwidth {r.get('bandwidth_gbps', 0)}GB/s "
f"< {r.get('min_required_gbps', 'N/A')}GB/s"
)
for r in latency_results:
if r.get("status") != "PASS":
if r.get("error"):
failures.append(f"{r.get('test')} failed: {r.get('error')}")
else:
failures.append(
f"{r.get('test')} latency {r.get('latency_us', 0)}us "
f"> {r.get('max_allowed_us', 'N/A')}us"
)
for r in ibping_results:
if r.get("status") != "PASS":
failures.append(f"{r.get('test')} failed: {r.get('error') or r.get('output_tail', '')[:120]}")
if fabric_counters.get("failed"):
nonzero = [f"{k}={v}" for k, v in fabric_counters.get("counters", {}).items() if v]
failures.append("non-zero PFC/ECN/CNP/congestion counters: " + ", ".join(nonzero[:10]))
elif fabric_counters and not fabric_counters.get("counters"):
failures.append("PFC/ECN/CNP/congestion counters not found; fabric counter evidence missing")
return failures
def _run_ib_command(self, cmd: List[str], timeout: int = 60) -> dict:
try:
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
@ -168,45 +269,70 @@ class RDMATest:
iters = self.rdma_cfg.get("ib_iterations", 1000)
dx = self.rdma_cfg.get("ib_device", None)
port = self.rdma_cfg.get("ib_port", 1)
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
role = self.rdma_cfg.get("role", "auto")
for tool, label in [(ib_write_bw, "ib_write_bw"), (ib_read_bw, "ib_read_bw")]:
if not tool:
results.append({"test": label, "status": "SKIP", "error": "not installed"})
results.append({"test": label, "status": "FAIL", "error": "not installed"})
continue
if role == "client" and not server_addr:
results.append({
"test": label,
"status": "FAIL",
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
"role": "client",
})
continue
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size)]
client_cmd = server_cmd + ["localhost"]
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
client_cmd = server_cmd + [server_addr or "localhost"]
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
import time
time.sleep(1)
if role == "server":
results.append(self._run_server_mode(label, server_cmd))
continue
server = None
if not server_addr and role != "client":
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
import time
time.sleep(1)
try:
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
server.wait(timeout=10)
if server:
server.wait(timeout=10)
output = client.stdout + server.stdout.read() if server.stdout else ""
bw_mbps = 0
output = client.stdout
if server and server.stdout:
output += server.stdout.read()
bw_mibps = 0
for line in output.split("\n"):
line = line.strip()
if not line:
continue
parts = line.split()
try:
bw_mbps = max(bw_mbps, float(parts[-1]))
if len(parts) >= 5 and int(parts[0]) == int(msg_size):
# perftest bandwidth rows:
# #bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
bw_mibps = max(bw_mibps, float(parts[3]))
except (ValueError, IndexError):
continue
bw_gbps = bw_mbps / 1000 if bw_mbps else 0
status = "PASS" if bw_gbps >= min_bw else "WARN"
bw_gbps = bw_mibps * 1024 * 1024 / 1e9 if bw_mibps else 0
status = "PASS" if bw_gbps >= min_bw else "FAIL"
results.append({
"test": label,
"status": status,
"bandwidth_gbps": round(bw_gbps, 2),
"min_required_gbps": min_bw,
"msg_size": msg_size,
"role": "client" if server_addr else "local_loopback",
})
except Exception as e:
server.kill()
if server:
server.kill()
results.append({"test": label, "status": "FAIL", "error": str(e)})
return results
@ -216,240 +342,214 @@ class RDMATest:
ib_write_lat = self._find_tool("ib_write_lat")
ib_read_lat = self._find_tool("ib_read_lat")
max_lat_us = self.rdma_cfg.get("max_latency_us", 10)
max_by_test = {
"ib_write_lat": self.rdma_cfg.get("max_write_latency_us", max_lat_us),
"ib_read_lat": self.rdma_cfg.get("max_read_latency_us", max_lat_us),
}
dx = self.rdma_cfg.get("ib_device", None)
port = self.rdma_cfg.get("ib_port", 1)
msg_size = self.rdma_cfg.get("latency_msg_size", 8)
iters = self.rdma_cfg.get("ib_iterations", 1000)
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
role = self.rdma_cfg.get("role", "auto")
for tool, label in [(ib_write_lat, "ib_write_lat"), (ib_read_lat, "ib_read_lat")]:
if not tool:
results.append({"test": label, "status": "SKIP", "error": "not installed"})
results.append({"test": label, "status": "FAIL", "error": "not installed"})
continue
if role == "client" and not server_addr:
results.append({
"test": label,
"status": "FAIL",
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
"role": "client",
})
continue
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port)]
client_cmd = server_cmd + ["localhost"]
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
client_cmd = server_cmd + [server_addr or "localhost"]
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
import time
time.sleep(1)
if role == "server":
results.append(self._run_server_mode(label, server_cmd))
continue
server = None
if not server_addr and role != "client":
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
import time
time.sleep(1)
try:
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
server.wait(timeout=10)
if server:
server.wait(timeout=10)
output = client.stdout + server.stdout.read() if server.stdout else ""
output = client.stdout
if server and server.stdout:
output += server.stdout.read()
lat_us = 0
for line in output.split("\n"):
parts = line.strip().split()
try:
lat_us = max(lat_us, float(parts[-1]))
if len(parts) >= 6:
int(parts[0])
int(parts[1])
# perftest latency rows:
# #bytes #iterations t_min t_max t_typical t_avg t_stdev p99 p99.9
lat_us = max(lat_us, float(parts[5]))
except (ValueError, IndexError):
continue
status = "PASS" if 0 < lat_us <= max_lat_us else ("WARN" if lat_us > 0 else "FAIL")
max_allowed = max_by_test[label]
status = "PASS" if 0 < lat_us <= max_allowed else "FAIL"
results.append({
"test": label,
"status": status,
"latency_us": round(lat_us, 2),
"max_allowed_us": max_lat_us,
"max_allowed_us": max_allowed,
"msg_size": msg_size,
"role": "client" if server_addr else "local_loopback",
})
except Exception as e:
server.kill()
if server:
server.kill()
results.append({"test": label, "status": "FAIL", "error": str(e)})
return results
# ------------------------------------------------------------------
# Cross-node (two-host) RDMA over perftest, orchestrated via SSH.
# Runs FROM the client host: for each IB device it launches the matching
# perftest server on the peer over SSH (held open in a live ssh channel),
# then runs the local client against the peer's OOB address and parses the
# result. Replaces the old standalone scripts/rdma_cross_node.sh.
# ------------------------------------------------------------------
def _active_ib_devices(self) -> List[str]:
"""IB devices whose port 1 is InfiniBand link_layer and ACTIVE."""
out = []
for dev in self._get_ib_devices():
for port in self._get_ib_ports(dev):
ll = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/link_layer")
st = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state")
if ll == "InfiniBand" and "ACTIVE" in st.upper():
out.append(dev)
break
return out
def run_cross_node(self) -> dict:
cn = self.rdma_cfg.get("cross_node", {}) or {}
if not cn.get("enabled"):
return {"status": "SKIP", "skipped": True,
"reason": "rdma.cross_node.enabled is false"}
server = cn.get("server")
if not server:
return {"status": "SKIP", "skipped": True,
"reason": "rdma.cross_node.server (peer ssh address) not set"}
ssh_user = cn.get("ssh_user", "root")
server_target = server if "@" in server else f"{ssh_user}@{server}"
# OOB address the client's perftest connects to (defaults to the ssh host).
server_addr = cn.get("server_addr") or server.split("@")[-1]
ib_port = cn.get("ib_port", 1)
gid_index = cn.get("gid_index")
msg_size = cn.get("msg_size", 1048576)
iters = cn.get("iters", 5000)
base_port = cn.get("base_oob_port", 18515)
warmup = cn.get("server_warmup_sec", 2.0)
min_bw = cn.get("min_bandwidth_gbps", 350)
max_lat = cn.get("max_latency_us", 5)
devices = cn.get("devices") or self._active_ib_devices()
if not devices:
return {"status": "SKIP", "skipped": True,
"reason": "no active InfiniBand devices to test"}
has_bw = self._find_tool("ib_write_bw") is not None
has_lat = self._find_tool("ib_write_lat") is not None
if not has_bw and not has_lat:
return {"status": "SKIP", "skipped": True,
"reason": "perftest (ib_write_bw / ib_write_lat) not installed"}
self.console.print(
f"[cyan]Cross-node RDMA — client → {server_addr}, "
f"devices: {', '.join(devices)}[/cyan]")
per_device = []
for idx, dev in enumerate(devices):
oob = base_port + idx
entry = {"device": dev}
if has_bw:
bw = self._cross_node_perftest(
"ib_write_bw", dev, server_target, server_addr, ib_port,
oob, gid_index, warmup,
extra=["--report_gbits", "-s", str(msg_size), "-n", str(iters)],
parse="bw")
entry["bandwidth_gbps"] = bw
if isinstance(bw, (int, float)):
entry["bw_status"] = "PASS" if bw >= min_bw else "WARN"
else:
entry["bw_status"] = "FAIL"
if has_lat:
lat = self._cross_node_perftest(
"ib_write_lat", dev, server_target, server_addr, ib_port,
oob, gid_index, warmup, extra=[], parse="lat")
if isinstance(lat, dict):
entry["latency_us"] = lat.get("typical")
entry["latency_p99_us"] = lat.get("p99")
t = lat.get("typical")
entry["lat_status"] = ("PASS" if isinstance(t, (int, float)) and 0 < t <= max_lat
else ("WARN" if isinstance(t, (int, float)) else "FAIL"))
else:
entry["latency_us"] = lat
entry["lat_status"] = "FAIL"
per_device.append(entry)
statuses = [e.get(k) for e in per_device for k in ("bw_status", "lat_status") if e.get(k)]
verdict = "PASS"
for s in statuses:
if s == "FAIL":
verdict = "FAIL"
break
if s == "WARN" and verdict == "PASS":
verdict = "WARN"
return {
"status": verdict,
"server": server_addr,
"min_bandwidth_gbps": min_bw,
"max_latency_us": max_lat,
"per_device": per_device,
"timestamp": datetime.now().isoformat(),
}
def _cross_node_perftest(self, tool: str, dev: str, server_target: str,
server_addr: str, ib_port: int, oob_port: int,
gid_index, warmup: float, extra: List[str], parse: str):
"""Start `tool` server on the peer via SSH, run the local client, parse output.
Returns a float (bw, Gb/s), a dict {typical, p99} (lat, µs), or an error string.
"""
tool_path = self._find_tool(tool)
if not tool_path:
return f"{tool} not installed"
flags = ["-d", dev, "-i", str(ib_port), "-p", str(oob_port), "-F"]
if gid_index is not None:
flags += ["-x", str(gid_index)]
flags += extra
server_cmd = " ".join([tool] + flags) # server: no host argument
server_proc = None
def _run_server_mode(self, label: str, server_cmd: List[str]) -> dict:
timeout = int(self.rdma_cfg.get("server_timeout_sec", 120))
try:
server_proc = subprocess.Popen(
["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
server_target, server_cmd],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
time.sleep(warmup) # let the remote server bind before the client connects
client = subprocess.run([tool_path] + flags + [server_addr],
capture_output=True, text=True, timeout=120)
out = client.stdout + "\n" + (client.stderr or "")
return self._parse_perftest_lat(out) if parse == "lat" else self._parse_perftest_bw(out)
r = subprocess.run(server_cmd, capture_output=True, text=True, timeout=timeout)
return {
"test": label,
"status": "PASS" if r.returncode == 0 else "FAIL",
"role": "server",
"server_timeout_sec": timeout,
"output_tail": (r.stdout + r.stderr)[-500:],
}
except subprocess.TimeoutExpired:
return "timeout"
except Exception as e: # noqa: BLE001
return f"error: {e}"
finally:
if server_proc and server_proc.poll() is None:
server_proc.terminate()
return {
"test": label,
"status": "PASS",
"role": "server",
"server_timeout_sec": timeout,
"note": "server ran until timeout waiting for client",
}
except Exception as e:
return {"test": label, "status": "FAIL", "role": "server", "error": str(e)}
def _run_ibping_tests(self, active_pairs: List[tuple[str, str]]) -> List[dict]:
tool = self._find_tool("ibping")
if not tool:
return [{"test": "ibping", "status": "FAIL", "error": "not installed"}]
if not active_pairs:
return [{"test": "ibping", "status": "FAIL", "error": "no active IB ports"}]
dev, port = active_pairs[0]
target = (
self.rdma_cfg.get("ibping_target")
or os.environ.get("IBPING_TARGET")
)
count = int(self.rdma_cfg.get("ibping_count", 5))
role = self.rdma_cfg.get("role", "auto")
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
base = [tool, "-C", dev, "-P", str(port)]
if role == "server":
return [self._run_server_mode("ibping", [*base, "-S"])]
server = None
if not target and role != "client":
target = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/lid")
server = subprocess.Popen([*base, "-S"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
import time
time.sleep(1)
if not target:
reason = "no ibping target/lid"
if role == "client" or server_addr:
reason = (
"cross-node ibping requires rdma.ibping_target or IBPING_TARGET "
"(peer LID/GID; rdma.server_addr is only for perftest TCP bootstrap)"
)
return [{"test": "ibping", "status": "FAIL", "error": reason}]
try:
r = subprocess.run([*base, "-c", str(count), str(target)], capture_output=True, text=True, timeout=30)
if server:
server.terminate()
try:
server_proc.wait(timeout=5)
server.wait(timeout=5)
except subprocess.TimeoutExpired:
server.kill()
output = r.stdout + r.stderr
failed = r.returncode != 0 or "failed" in output.lower()
return [{
"test": "ibping",
"status": "FAIL" if failed else "PASS",
"role": "client" if server_addr or role == "client" else "local_loopback",
"direction": "outbound_to_peer" if server_addr or role == "client" else "local_loopback",
"target": str(target),
"count": count,
"output_tail": output[-500:],
}]
except Exception as e:
if server:
server.kill()
return [{"test": "ibping", "status": "FAIL", "error": str(e)}]
def _collect_pfc_ecn_counters(self) -> dict:
counters = {}
failed = False
keywords = ("pfc", "ecn", "cnp", "congestion")
for root, _, files in os.walk("/sys/class/infiniband"):
for name in files:
lower = name.lower()
if not any(k in lower for k in keywords):
continue
path = os.path.join(root, name)
val = self._read_sys(path)
try:
num = int(val)
except ValueError:
continue
rel = path.replace("/sys/class/infiniband/", "")
counters[rel] = num
if num != 0:
failed = True
ethtool = shutil.which("ethtool")
net_dir = "/sys/class/net"
if ethtool and os.path.isdir(net_dir):
for iface in sorted(os.listdir(net_dir)):
try:
r = subprocess.run(
[ethtool, "-S", iface],
capture_output=True,
text=True,
timeout=10,
)
except Exception:
server_proc.kill()
# ib_write_* server normally exits after one run; pkill cleans up a
# leftover one if the client failed mid-handshake. -x matches the exact
# process name so it never kills this ssh command itself.
try:
subprocess.run(
["ssh", "-o", "BatchMode=yes", server_target, f"pkill -x {tool}"],
capture_output=True, timeout=10)
except Exception:
pass
@staticmethod
def _parse_perftest_bw(output: str) -> float:
"""Parse ib_write_bw rows (#bytes #iter BW_peak BW_avg ...); return max BW avg."""
best = 0.0
for line in output.splitlines():
parts = line.split()
if len(parts) >= 4:
try:
int(parts[0]) # #bytes column
best = max(best, float(parts[3])) # BW average[Gb/sec]
except ValueError:
continue
return round(best, 2) if best else 0.0
@staticmethod
def _parse_perftest_lat(output: str) -> dict:
"""Parse ib_write_lat row (#bytes #iter t_min t_max t_typical t_avg ... 99%)."""
for line in output.splitlines():
parts = line.split()
if len(parts) >= 6:
try:
int(parts[0]); int(parts[1])
typical = float(parts[4]) # t_typical[usec]
except ValueError:
if r.returncode != 0:
continue
p99 = None
if len(parts) >= 8:
for line in r.stdout.splitlines():
if ":" not in line:
continue
key, value = line.split(":", 1)
key = key.strip()
lower = key.lower()
if not any(k in lower for k in keywords):
continue
try:
p99 = float(parts[7]) # 99% percentile[usec]
except ValueError:
p99 = None
return {"typical": round(typical, 2), "p99": round(p99, 2) if p99 else None}
return {"typical": None, "p99": None}
num = int(value.strip().split()[0])
except (ValueError, IndexError):
continue
counters[f"net/{iface}/{key}"] = num
if num != 0:
failed = True
return {"failed": failed, "counters": counters}
@staticmethod
def print_results(results: dict, console: Console = None):
@ -496,28 +596,10 @@ class RDMATest:
f"({lat:.2f} us, max: {t.get('max_allowed_us', 'N/A')} us)" if status != "SKIP"
else f" {t['test']}: [dim]SKIPPED[/dim]")
cn = results.get("cross_node")
if cn:
if cn.get("skipped"):
c.print(f"\n [bold]Cross-node RDMA[/bold]: [dim]SKIPPED "
f"({cn.get('reason', '')})[/dim]")
else:
v = cn.get("status", "?")
vc = "green" if v == "PASS" else ("yellow" if v == "WARN" else "red")
c.print(f"\n [bold]Cross-node RDMA[/bold] (server {cn.get('server')}) "
f"[{vc}]{v}[/{vc}] "
f"[dim]min {cn.get('min_bandwidth_gbps')} Gb/s, "
f"max {cn.get('max_latency_us')} µs[/dim]")
for e in cn.get("per_device", []):
bw = e.get("bandwidth_gbps")
lat = e.get("latency_us")
bws = e.get("bw_status", "")
lts = e.get("lat_status", "")
bc = "green" if bws == "PASS" else ("yellow" if bws == "WARN" else "red")
lc = "green" if lts == "PASS" else ("yellow" if lts == "WARN" else "red")
bw_s = f"{bw:.1f} Gb/s" if isinstance(bw, (int, float)) else str(bw)
lat_s = f"{lat:.2f} µs" if isinstance(lat, (int, float)) else str(lat)
p99 = e.get("latency_p99_us")
p99_s = f", p99 {p99:.2f}" if isinstance(p99, (int, float)) else ""
c.print(f" {e['device']}: BW [{bc}]{bw_s}[/{bc}] | "
f"lat [{lc}]{lat_s}[/{lc}]{p99_s}")
ibping_tests = results.get("ibping_tests", [])
if ibping_tests:
c.print("\n [bold]IB Ping Tests[/bold]")
for t in ibping_tests:
status = t.get("status", "FAIL")
sc = "green" if status == "PASS" else "red"
c.print(f" {t['test']}: [{sc}]{status}[/{sc}] target={t.get('target', 'N/A')}")

View File

@ -1,87 +0,0 @@
# cuBLASLt FP8 GEMM Cross-Check Report
Date: 2026-05-24
Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
## Method
Added a direct cuBLASLt FP8 GEMM micro-benchmark:
- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
- Input dtype: `CUDA_R_8F_E4M3`
- Output dtype: `CUDA_R_16BF`
- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
- Layout: cuBLASLt FP8-required TN format
- Matrix size: `8192`
- Warmup: `50`
- Iterations: `500`
- GPUs: single-node 8 GPUs, measured one GPU at a time
NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
## Results
### aikubeworker0012 / nccl-gpu-1
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
| GPU | FP8 TFLOPS |
|---:|---:|
| 0 | 1615.6 |
| 1 | 1611.0 |
| 2 | 1599.0 |
| 3 | 1607.1 |
| 4 | 1614.0 |
| 5 | 1604.4 |
| 6 | 1608.4 |
| 7 | 1609.1 |
Summary:
- Mean: `1608.6 TFLOPS`
- Min / Max: `1599.0 / 1615.6 TFLOPS`
- Spread: `1.03%`
- FP8 absolute threshold: `>= 1400 TFLOPS`
- Verdict against FP8 absolute threshold: **PASS**
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
### aikubeworker0016 / nccl-gpu-2
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
| GPU | FP8 TFLOPS |
|---:|---:|
| 0 | 1602.3 |
| 1 | 1604.0 |
| 2 | 1616.9 |
| 3 | 1610.6 |
| 4 | 1620.5 |
| 5 | 1630.3 |
| 6 | 1605.1 |
| 7 | 1620.2 |
Summary:
- Mean: `1613.7 TFLOPS`
- Min / Max: `1602.3 / 1630.3 TFLOPS`
- Spread: `1.74%`
- FP8 absolute threshold: `>= 1400 TFLOPS`
- Verdict against FP8 absolute threshold: **PASS**
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
## Comparison With Existing PyTorch `_scaled_mm` Result
| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
|---|---:|---:|---:|
| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
## Conclusion
The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.

View File

@ -1,21 +0,0 @@
{
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"per_gpu": [
{"index": 0, "fp8_tflops": 1615.6},
{"index": 1, "fp8_tflops": 1611.0},
{"index": 2, "fp8_tflops": 1599.0},
{"index": 3, "fp8_tflops": 1607.1},
{"index": 4, "fp8_tflops": 1614.0},
{"index": 5, "fp8_tflops": 1604.4},
{"index": 6, "fp8_tflops": 1608.4},
{"index": 7, "fp8_tflops": 1609.1}
],
"mean_tflops": 1608.6,
"min_tflops": 1599.0,
"max_tflops": 1615.6,
"spread_pct": 1.03
}

View File

@ -1,21 +0,0 @@
{
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"per_gpu": [
{"index": 0, "fp8_tflops": 1602.3},
{"index": 1, "fp8_tflops": 1604.0},
{"index": 2, "fp8_tflops": 1616.9},
{"index": 3, "fp8_tflops": 1610.6},
{"index": 4, "fp8_tflops": 1620.5},
{"index": 5, "fp8_tflops": 1630.3},
{"index": 6, "fp8_tflops": 1605.1},
{"index": 7, "fp8_tflops": 1620.2}
],
"mean_tflops": 1613.7,
"min_tflops": 1602.3,
"max_tflops": 1630.3,
"spread_pct": 1.74
}

View File

@ -1,169 +0,0 @@
# FP8 GEMM 路径对比测试报告
测试日期2026-05-25
测试节点aikubeworker0012、aikubeworker0016
测试 GPUNVIDIA H100 80GB HBM3
测试目标:对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
## 一、测试结论
本次 A-E 五条路径均已完成实测。
核心结论:
1. direct cuBLASLt 是本组测试里最快路径,两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
3. 打开 `use_fast_accum=True`PyTorch eager 路径有稳定提升,约提升 5.0%-6.7%。
4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS但仍低于 direct cuBLASLt。
5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径,不是裸 GEMM因此包含 TE module、cast、FP8 recipe 等额外开销,结果低于 direct cuBLASLt也低于 CUDA Graph `_scaled_mm`
这说明:当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题;之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果,主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能,而不是 GPU 算力极限。
## 二、测试方法
统一参数:
| 参数 | 值 |
|---|---:|
| matrix_size | 8192 |
| M/N/K | 8192/8192/8192 |
| warmup | 50 |
| iterations | 500 |
| GPU index | 0 |
| PyTorch | 2.6.0+cu124 |
| CUDA | 12.4 |
| 输入 dtype | FP8 E4M3 |
| 输出 dtype | BF16 |
| accumulation | FP32 |
| scale_a / scale_b | 1.0 / 1.0 |
测试路径定义:
| 路径 | 名称 | 含义 |
|---|---|---|
| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`,默认 accumulation 参数 |
| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径,但显式打开 fast accumulation |
| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用,降低 Python/PyTorch launch 间隙 |
| D | Transformer Engine FP8 GEMM | `te.Linear``fp8_autocast` 下执行,包含 TE 层封装和 FP8 recipe 开销 |
| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`,绕过 PyTorch eager |
复现脚本:
```bash
MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
/root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
```
## 三、实测结果
### aikubeworker0012
原始 JSON`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|---|---|---:|---:|
| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
相对 A 的提升:
| 路径 | 相对 A |
|---|---:|
| B | +6.7% |
| C | +11.5% |
| D | -2.8% |
| E | +37.1% |
E 路径 cuBLASLt 算法信息:
| 字段 | 值 |
|---|---:|
| algo_id | 52 |
| tile_id | 23 |
| splitk | 1 |
| stages_id | 36 |
| inner_shape_id | 0 |
| cluster_shape_id | 3 |
### aikubeworker0016
原始 JSON`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|---|---|---:|---:|
| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
相对 A 的提升:
| 路径 | 相对 A |
|---|---:|
| B | +5.0% |
| C | +10.0% |
| D | -3.2% |
| E | +37.5% |
E 路径 cuBLASLt 算法信息:
| 字段 | 值 |
|---|---:|
| algo_id | 52 |
| tile_id | 23 |
| splitk | 1 |
| stages_id | 36 |
| inner_shape_id | 0 |
| cluster_shape_id | 3 |
## 四、对 PyTorch FP8 能否“上去”的判断
从本次结果看PyTorch FP8 路径可以通过两类方式上去:
1. 打开更快的 math/accumulation 参数,例如 `use_fast_accum=True`
2. 使用 CUDA Graph replay减少 eager 模式下每轮调度、enqueue 之间的间隙。
但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说direct cuBLASLt 证明硬件和底层库有能力跑得更高PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”后续可以继续验证
1. 更大的 GEMM size例如 16384。
2. 固定 shape 后用 `torch.compile` 或 Inductor。
3. CUDA Graph 覆盖更完整的 step而不是只 replay 单个 op。
4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark而不是 `te.Linear` module forward。
5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取,确认实际 kernel、间隙和 cuBLASLt 算法选择。
## 五、术语说明
`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic然后把 matmul enqueue 到 CUDA stream。
`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口,支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`,不经过 PyTorch eager因此更接近裸 GEMM 峰值。
`CUDA Graph` 指把一次 CUDA work 提前捕获成图,后续直接 replay减少 CPU 侧反复 launch/调度带来的间隙。
`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward不等同于裸 GEMM microbenchmark。
## 六、文件清单
本地脚本:
| 文件 | 用途 |
|---|---|
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
本地结果:
| 文件 | 用途 |
|---|---|
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |

View File

@ -1,142 +0,0 @@
{
"source": "fp8_path_comparison",
"host": null,
"matrix_size": 8192,
"gpu_index": 0,
"pytorch": {
"source": "pytorch_fp8_path_bench",
"torch": "2.6.0+cu124",
"cuda": "12.4",
"gpu_index": 0,
"gpu_name": "NVIDIA H100 80GB HBM3",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 465.145,
"event_us_per_iter": 930.29,
"wall_ms_total": 465.21,
"tflops": 1181.9
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 440.252,
"event_us_per_iter": 880.504,
"wall_ms_total": 440.289,
"tflops": 1248.7
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 415.631,
"event_us_per_iter": 831.262,
"wall_ms_total": 415.664,
"tflops": 1322.7
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "unavailable",
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
}
],
"summary": {
"max_tflops": 1322.7,
"min_tflops": 1181.9,
"mean_tflops": 1251.1
}
},
"cublaslt": {
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"fast_accum": 1,
"per_gpu": [
{
"index": 0,
"fp8_tflops": 1615.4,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3
}
],
"mean_tflops": 1615.4,
"min_tflops": 1615.4,
"max_tflops": 1615.4,
"spread_pct": 0.0
},
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 465.145,
"event_us_per_iter": 930.29,
"wall_ms_total": 465.21,
"tflops": 1181.9
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 440.252,
"event_us_per_iter": 880.504,
"wall_ms_total": 440.289,
"tflops": 1248.7
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 415.631,
"event_us_per_iter": 831.262,
"wall_ms_total": 415.664,
"tflops": 1322.7
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "unavailable",
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
},
{
"index": 0,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3,
"name": "E_direct_cublaslt_fast_accum",
"status": "ok",
"tflops": 1615.4,
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"fast_accum": 1,
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
}
]
}

View File

@ -1,156 +0,0 @@
{
"source": "fp8_path_comparison",
"host": null,
"matrix_size": 8192,
"gpu_index": 0,
"pytorch": {
"source": "pytorch_fp8_path_bench",
"torch": "2.6.0+cu124",
"cuda": "12.4",
"gpu_index": 0,
"gpu_name": "NVIDIA H100 80GB HBM3",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 463.507,
"event_us_per_iter": 927.014,
"wall_ms_total": 463.573,
"tflops": 1186.1
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 434.241,
"event_us_per_iter": 868.481,
"wall_ms_total": 434.492,
"tflops": 1266.0
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 415.786,
"event_us_per_iter": 831.573,
"wall_ms_total": 415.825,
"tflops": 1322.2
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 476.739,
"event_us_per_iter": 953.478,
"wall_ms_total": 476.8,
"tflops": 1153.2,
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
}
],
"summary": {
"max_tflops": 1322.2,
"min_tflops": 1153.2,
"mean_tflops": 1231.9
}
},
"cublaslt": {
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"fast_accum": 1,
"per_gpu": [
{
"index": 0,
"fp8_tflops": 1626.6,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3
}
],
"mean_tflops": 1626.6,
"min_tflops": 1626.6,
"max_tflops": 1626.6,
"spread_pct": 0.0
},
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 463.507,
"event_us_per_iter": 927.014,
"wall_ms_total": 463.573,
"tflops": 1186.1
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 434.241,
"event_us_per_iter": 868.481,
"wall_ms_total": 434.492,
"tflops": 1266.0
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 415.786,
"event_us_per_iter": 831.573,
"wall_ms_total": 415.825,
"tflops": 1322.2
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 476.739,
"event_us_per_iter": 953.478,
"wall_ms_total": 476.8,
"tflops": 1153.2,
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
},
{
"index": 0,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3,
"name": "E_direct_cublaslt_fast_accum",
"status": "ok",
"tflops": 1626.6,
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"fast_accum": 1,
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
}
]
}

View File

@ -1,142 +0,0 @@
{
"source": "fp8_path_comparison",
"host": null,
"matrix_size": 8192,
"gpu_index": 0,
"pytorch": {
"source": "pytorch_fp8_path_bench",
"torch": "2.6.0+cu124",
"cuda": "12.4",
"gpu_index": 0,
"gpu_name": "NVIDIA H100 80GB HBM3",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 470.909,
"event_us_per_iter": 941.817,
"wall_ms_total": 470.974,
"tflops": 1167.4
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 452.608,
"event_us_per_iter": 905.215,
"wall_ms_total": 452.647,
"tflops": 1214.6
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 427.724,
"event_us_per_iter": 855.449,
"wall_ms_total": 427.768,
"tflops": 1285.3
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "unavailable",
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
}
],
"summary": {
"max_tflops": 1285.3,
"min_tflops": 1167.4,
"mean_tflops": 1222.4
}
},
"cublaslt": {
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"fast_accum": 1,
"per_gpu": [
{
"index": 0,
"fp8_tflops": 1594.3,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3
}
],
"mean_tflops": 1594.3,
"min_tflops": 1594.3,
"max_tflops": 1594.3,
"spread_pct": 0.0
},
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 470.909,
"event_us_per_iter": 941.817,
"wall_ms_total": 470.974,
"tflops": 1167.4
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 452.608,
"event_us_per_iter": 905.215,
"wall_ms_total": 452.647,
"tflops": 1214.6
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 427.724,
"event_us_per_iter": 855.449,
"wall_ms_total": 427.768,
"tflops": 1285.3
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "unavailable",
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
},
{
"index": 0,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3,
"name": "E_direct_cublaslt_fast_accum",
"status": "ok",
"tflops": 1594.3,
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"fast_accum": 1,
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
}
]
}

View File

@ -1,156 +0,0 @@
{
"source": "fp8_path_comparison",
"host": null,
"matrix_size": 8192,
"gpu_index": 0,
"pytorch": {
"source": "pytorch_fp8_path_bench",
"torch": "2.6.0+cu124",
"cuda": "12.4",
"gpu_index": 0,
"gpu_name": "NVIDIA H100 80GB HBM3",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 473.156,
"event_us_per_iter": 946.313,
"wall_ms_total": 473.199,
"tflops": 1161.9
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 450.48,
"event_us_per_iter": 900.96,
"wall_ms_total": 450.505,
"tflops": 1220.4
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 430.272,
"event_us_per_iter": 860.543,
"wall_ms_total": 430.304,
"tflops": 1277.7
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 488.527,
"event_us_per_iter": 977.054,
"wall_ms_total": 488.576,
"tflops": 1125.3,
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
}
],
"summary": {
"max_tflops": 1277.7,
"min_tflops": 1125.3,
"mean_tflops": 1196.3
}
},
"cublaslt": {
"source": "cuBLASLt",
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"fast_accum": 1,
"per_gpu": [
{
"index": 0,
"fp8_tflops": 1598.1,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3
}
],
"mean_tflops": 1598.1,
"min_tflops": 1598.1,
"max_tflops": 1598.1,
"spread_pct": 0.0
},
"results": [
{
"name": "A_eager_scaled_mm_default",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 473.156,
"event_us_per_iter": 946.313,
"wall_ms_total": 473.199,
"tflops": 1161.9
},
{
"name": "B_eager_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 450.48,
"event_us_per_iter": 900.96,
"wall_ms_total": 450.505,
"tflops": 1220.4
},
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 3,
"event_ms_total": 430.272,
"event_us_per_iter": 860.543,
"wall_ms_total": 430.304,
"tflops": 1277.7
},
{
"name": "D_transformer_engine_fp8_linear",
"status": "ok",
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"event_ms_total": 488.527,
"event_us_per_iter": 977.054,
"wall_ms_total": 488.576,
"tflops": 1125.3,
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
},
{
"index": 0,
"algo_id": 52,
"tile_id": 23,
"splitk": 1,
"stages_id": 36,
"inner_shape_id": 0,
"cluster_shape_id": 3,
"name": "E_direct_cublaslt_fast_accum",
"status": "ok",
"tflops": 1598.1,
"matrix_size": 8192,
"iterations": 500,
"warmup": 50,
"fast_accum": 1,
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
}
]
}

View File

@ -1,152 +0,0 @@
# GPU_Test 合并报告
- **日期:** 2026-05-24
- **节点:** `aikubeworker0012 / 172.72.8.12``aikubeworker0016 / 172.72.8.16`
- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
- **范围:** 单机单卡算力与多机多卡 NCCL 通信
- **说明:** 本报告汇总既有原始测试结果,不重新启动额外压力测试。
## 总体结论
| 测试项 | 结论 | 说明 |
|---|---|---|
| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`,低于阈值;已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低,不作为硬件失败依据 |
| 多机多卡 NCCL 正确性 | PASS | return code `0``Wrong=0` / `Out of bounds values: 0 OK` |
| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值,但该阈值不应直接硬套到当前 4x400Gbps 环境 |
## 单机单卡 / 算力测试
### 机器信息
| Host | GPU | Driver | CUDA | GPU 数量 |
|---|---|---|---|---:|
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
来源:
- `reports_single_gpu_aikubeworker0012.md`
- `reports_single_gpu_aikubeworker0016.md`
### 原始 PyTorch 单机算力结果
| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
|---|---:|---:|---:|---:|---:|---|
| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响,不能直接代表 H100 FP8 Tensor Core 硬件上限。
### direct cuBLASLt FP8 GEMM 交叉验证
测试参数:
| 参数 | 值 |
|---|---|
| Benchmark | direct cuBLASLt FP8 GEMM |
| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
| Matrix | `8192 x 8192 x 8192` |
| A/B dtype | FP8 E4M3 |
| Output dtype | BF16 |
| Compute type | `CUBLAS_COMPUTE_32F` |
| Scale type | `CUDA_R_32F` |
| Scale A/B | `1.0` |
| Layout | TN |
| fast accumulation | enabled |
| Threshold | `>= 1400 TFLOPS` |
结果:
| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
|---|---:|---:|---:|---:|---:|---|
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
单卡逐张结果:
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|---|---:|---:|---:|---:|---:|---:|---:|---:|
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
结论direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值,说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录,而不是 GPU 硬件失败结论。
来源:
- `reports_cublaslt_fp8_crosscheck_20260524.md`
- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
## 多机多卡 NCCL 测试
### 测试环境
| 项目 | 结果 |
|---|---|
| Hosts | `nccl-gpu-1(172.72.8.12)``nccl-gpu-2(172.72.8.16)` |
| Topology | 2 nodes x 8 GPUs合计 16 GPUs |
| NCCL source | `nccl-tests-mpirun` |
| NCCL network | IB |
| GPU Direct RDMA | ENABLED |
| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
注意NCCL 表里的 `GB/s` 是大 B即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b即 bits/s。
### 2x8 全集合通信结果
| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
|---|---:|---:|---:|---|---|
| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
结论2x8 全集合通信测试中NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值,但当前机器确认参与 NCCL 的是 4 条 400Gbps rail因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
来源:
- `reports_multinode_nccl_all_collectives_20260523_120144.md`
- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
### PDF Matrix allreduce / alltoall 结果
AllReducePDF 8x400Gbps 阈值对比,仅作参考):
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|---|---:|---:|---:|---:|---|
| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
AllToAllPDF 8x400Gbps 阈值对比,仅作参考):
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|---|---:|---:|---:|---:|---|
| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
来源:
- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
## 风险与判断
1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证,当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
2. 多机 NCCL 正确性通过,性能结果应按当前 4x400Gbps rail 环境解释。
3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB railPDF 参考环境为 8x400G 计算管理网络,因此 2x8 阈值与当前硬件形态不等价。
4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值,建议作为“与 PDF 参考环境差异”记录,而不是作为当前 4 rail 环境不合格结论。
## 建议
1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主PyTorch `_scaled_mm` 作为软件栈参考项保留。
2. 多机 NCCL 后续若要按 PDF 阈值验收,需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
3. 对外报告建议明确区分 `GB/s``Gb/s`NCCL bus bandwidth 是大 BIB 端口速率是小 b。

View File

@ -1,123 +0,0 @@
# GPU_Test 双节点测试报告
- **测试日期:** 2026-05-24
- **测试节点:** `aikubeworker0012 / 172.72.8.12``aikubeworker0016 / 172.72.8.16`
- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
## 结论摘要
| 项目 | 结果摘要 |
|---|---|
| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS |
| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成,主要大包通信带宽稳定 |
| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过,所有测试 `Wrong=0` / return code `0` |
| 多机网络口径 | 当前为 4x400Gbps IB rail 环境,结果按该硬件形态解释 |
## 测试环境
| Host | GPU | Driver | CUDA | GPU 数量 |
|---|---|---|---|---:|
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
## 单机算力测试
### FP8 GEMM 硬件路径验证
本项使用 direct cuBLASLt FP8 GEMM benchmark绕过 PyTorch eager 调度路径,直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
| 参数 | 配置 |
|---|---|
| GEMM shape | `8192 x 8192 x 8192` |
| 输入类型 | FP8 E4M3 |
| 输出类型 | BF16 |
| 累加类型 | FP32 compute |
| Layout | TN |
| Scale | `scale_a = 1.0``scale_b = 1.0` |
| fast accumulation | enabled |
| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
| Host | Mean FP8 TFLOPS | Min | Max | Spread |
|---|---:|---:|---:|---:|
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|---|---:|---:|---:|---:|---:|---:|---:|---:|
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS该结果反映 PyTorch 软件路径与调度开销,不作为本报告的硬件算力结论。
## 单机 8 卡 NCCL 通信测试
本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试,结果单位为 `GB/s`,即 Bytes/s。
| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
|---|---:|---:|
| allreduce | 472.3 GB/s | 472.4 GB/s |
| alltoall | 343.3 GB/s | 344.3 GB/s |
| broadcast | 364.1 GB/s | 363.6 GB/s |
| reducescatter | 352.8 GB/s | 353.1 GB/s |
| allgather | 366.4 GB/s | 366.4 GB/s |
| sendrecv | 369.0 GB/s | 368.9 GB/s |
**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近,节点间差异较小。
## 多机 2x8 NCCL 通信测试
本项使用两台节点,每台 8 张 GPU共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
### 网络环境
| 项目 | 配置 |
|---|---|
| Host A | `aikubeworker0012 / 172.72.8.12` |
| Host B | `aikubeworker0016 / 172.72.8.16` |
| 拓扑 | 2 nodes x 8 GPUs |
| NCCL network | IB |
| GPU Direct RDMA | ENABLED |
| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
### 跨节点 NCCL 结果
| Operation | Peak Bus BW | Avg Bus BW | Correctness |
|---|---:|---:|---|
| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
**正确性:** 本轮多机 NCCL 测试 return code 为 `0``Wrong=0`,未发现数据正确性错误。
## 单位说明
| 写法 | 含义 | 说明 |
|---|---|---|
| `GB/s` | Gigabytes per second | 大 B字节每秒NCCL bus bandwidth 使用此单位 |
| `Gbps` / `Gb/s` | Gigabits per second | 小 b比特每秒IB 端口速率通常使用此单位 |
换算关系:
```text
1 Byte = 8 bits
400 Gb/s = 50 GB/s
4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
```
NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽,不等同于单条物理链路的线速。
## 结果说明
1. 两台节点 GPU 识别正常,均为 8 张 H100 80GB HBM3。
2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPSGPU FP8 硬件计算路径正常。
3. 单机 8 卡 NCCL 通信在两台节点上结果接近,未观察到明显节点间异常差异。
4. 多机 2x8 NCCL 正确性通过,跨节点通信功能正常。
5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释;若后续需要对齐 8x400Gbps 环境,应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。

View File

@ -1,102 +0,0 @@
@page {
size: A4 landscape;
margin: 13mm;
}
body {
color: #111827;
font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
font-size: 11px;
line-height: 1.45;
}
h1 {
color: #0f172a;
font-size: 24px;
margin: 0 0 14px;
}
h2 {
border-bottom: 1px solid #cbd5e1;
color: #0f172a;
font-size: 17px;
margin: 24px 0 10px;
padding-bottom: 4px;
}
h3 {
color: #1f2937;
font-size: 13px;
margin: 16px 0 8px;
}
p {
margin: 7px 0;
}
code {
background: #f1f5f9;
border-radius: 3px;
color: #0f172a;
font-family: Menlo, Consolas, monospace;
font-size: 10px;
padding: 1px 3px;
}
pre {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 4px;
padding: 8px;
white-space: pre-wrap;
}
table {
border-collapse: collapse;
margin: 8px 0 14px;
page-break-inside: auto;
width: 100%;
}
thead {
display: table-header-group;
}
tr {
page-break-inside: avoid;
}
th,
td {
border: 1px solid #cbd5e1;
padding: 5px 6px;
text-align: left;
vertical-align: middle;
word-break: break-word;
}
th {
background: #e2e8f0;
color: #0f172a;
font-weight: 700;
}
tbody tr:nth-child(even) td {
background: #f8fafc;
}
a {
color: #2563eb;
text-decoration: none;
}
ul,
ol {
margin: 6px 0 10px 20px;
padding: 0;
}
li {
margin: 3px 0;
}

View File

@ -1,291 +0,0 @@
#include <cublasLt.h>
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <string>
#include <vector>
#define CHECK_CUDA(call) \
do { \
cudaError_t status = (call); \
if (status != cudaSuccess) { \
std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(status)); \
std::exit(1); \
} \
} while (0)
#define CHECK_CUBLAS(call) \
do { \
cublasStatus_t status = (call); \
if (status != CUBLAS_STATUS_SUCCESS) { \
std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \
__LINE__, static_cast<int>(status)); \
std::exit(1); \
} \
} while (0)
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = tid; i < count; i += stride) {
ptr[i] = __nv_fp8_e4m3(value);
}
}
struct Args {
int matrix_size = 8192;
int warmup = 20;
int iterations = 200;
int first_gpu = 0;
int gpu_count = -1;
size_t workspace_mb = 256;
int fast_accum = 1;
};
static Args parse_args(int argc, char **argv) {
Args args;
for (int i = 1; i < argc; ++i) {
auto need = [&](const char *name) {
if (i + 1 >= argc) {
std::fprintf(stderr, "Missing value for %s\n", name);
std::exit(2);
}
return argv[++i];
};
if (!std::strcmp(argv[i], "--matrix-size")) {
args.matrix_size = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--warmup")) {
args.warmup = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--iterations")) {
args.iterations = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--first-gpu")) {
args.first_gpu = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--gpu-count")) {
args.gpu_count = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--workspace-mb")) {
args.workspace_mb = static_cast<size_t>(std::atoll(need(argv[i])));
} else if (!std::strcmp(argv[i], "--fast-accum")) {
args.fast_accum = std::atoi(need(argv[i]));
} else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
"[--iterations N] [--first-gpu N] [--gpu-count N] "
"[--workspace-mb N] [--fast-accum 0|1]");
std::exit(0);
} else {
std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
std::exit(2);
}
}
return args;
}
static double run_one_gpu(int gpu, const Args &args) {
CHECK_CUDA(cudaSetDevice(gpu));
const int64_t m = args.matrix_size;
const int64_t n = args.matrix_size;
const int64_t k = args.matrix_size;
const size_t a_elems = static_cast<size_t>(m) * k;
const size_t b_elems = static_cast<size_t>(k) * n;
const size_t d_elems = static_cast<size_t>(m) * n;
__nv_fp8_e4m3 *d_a = nullptr;
__nv_fp8_e4m3 *d_b = nullptr;
__nv_bfloat16 *d_d = nullptr;
void *workspace = nullptr;
float *d_scale_a = nullptr;
float *d_scale_b = nullptr;
const float scale = 1.0f;
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
const int threads = 256;
const int blocks = 4096;
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaDeviceSynchronize());
cublasLtHandle_t lt;
cublasLtMatmulDesc_t op_desc;
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
cublasLtMatmulPreference_t preference;
CHECK_CUBLAS(cublasLtCreate(&lt));
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
// acceptance shape while satisfying the library's FP8 kernel constraints.
cublasOperation_t transa = CUBLAS_OP_T;
cublasOperation_t transb = CUBLAS_OP_N;
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
sizeof(d_scale_a)));
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
sizeof(d_scale_b)));
int8_t fast_accum = args.fast_accum ? 1 : 0;
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
sizeof(fast_accum)));
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
sizeof(workspace_bytes)));
cublasLtMatmulHeuristicResult_t heuristic;
int returned = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
&returned));
if (returned == 0) {
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
std::exit(1);
}
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
int32_t value = -1;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
uint32_t value = 0;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
uint16_t value = 0;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
const float alpha = 1.0f;
const float beta = 0.0f;
auto matmul = [&]() {
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
&beta, d_d, d_desc, d_d, d_desc,
&heuristic.algo, workspace, workspace_bytes, 0));
};
for (int i = 0; i < args.warmup; ++i) {
matmul();
}
CHECK_CUDA(cudaDeviceSynchronize());
cudaEvent_t start, stop;
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
CHECK_CUDA(cudaEventRecord(start));
for (int i = 0; i < args.iterations; ++i) {
matmul();
}
CHECK_CUDA(cudaEventRecord(stop));
CHECK_CUDA(cudaEventSynchronize(stop));
float elapsed_ms = 0.0f;
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
const double flops =
2.0 * static_cast<double>(m) * static_cast<double>(n) *
static_cast<double>(k) * static_cast<double>(args.iterations);
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
std::printf(
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
std::fflush(stdout);
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
CHECK_CUBLAS(cublasLtDestroy(lt));
CHECK_CUDA(cudaFree(d_a));
CHECK_CUDA(cudaFree(d_b));
CHECK_CUDA(cudaFree(d_d));
CHECK_CUDA(cudaFree(workspace));
CHECK_CUDA(cudaFree(d_scale_a));
CHECK_CUDA(cudaFree(d_scale_b));
CHECK_CUDA(cudaDeviceSynchronize());
return tflops;
}
int main(int argc, char **argv) {
Args args = parse_args(argc, argv);
int device_count = 0;
CHECK_CUDA(cudaGetDeviceCount(&device_count));
if (args.gpu_count < 0) {
args.gpu_count = device_count - args.first_gpu;
}
if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
args.first_gpu, args.gpu_count, device_count);
return 2;
}
std::vector<double> values;
std::printf("{\n");
std::printf(" \"source\": \"cuBLASLt\",\n");
std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
std::printf(" \"warmup\": %d,\n", args.warmup);
std::printf(" \"iterations\": %d,\n", args.iterations);
std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
std::printf(" \"per_gpu\": [\n");
for (int i = 0; i < args.gpu_count; ++i) {
int gpu = args.first_gpu + i;
double tflops = run_one_gpu(gpu, args);
values.push_back(tflops);
}
double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
auto minmax = std::minmax_element(values.begin(), values.end());
double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
std::printf(" ],\n");
std::printf(" \"mean_tflops\": %.1f,\n", mean);
std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
std::printf(" \"spread_pct\": %.2f\n", spread);
std::printf("}\n");
return mean >= 1400.0 ? 0 : 1;
}

View File

@ -1,277 +0,0 @@
#!/usr/bin/env python3
"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
Paths:
A. torch._scaled_mm eager, default accumulation
B. torch._scaled_mm eager, use_fast_accum=True
C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
D. Transformer Engine Linear under fp8_autocast, when installed
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
from typing import Any, Callable
import torch
def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
return flops / (elapsed_ms / 1000.0) / 1e12
def cuda_event_bench(
name: str,
matrix_size: int,
iterations: int,
warmup: int,
func: Callable[[int], Any],
) -> dict[str, Any]:
for i in range(warmup):
func(i)
torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
wall_start = time.perf_counter()
start.record()
for i in range(iterations):
func(i)
end.record()
torch.cuda.synchronize()
wall_elapsed = time.perf_counter() - wall_start
elapsed_ms = start.elapsed_time(end)
return {
"name": name,
"status": "ok",
"matrix_size": matrix_size,
"iterations": iterations,
"warmup": warmup,
"event_ms_total": round(elapsed_ms, 3),
"event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
"wall_ms_total": round(wall_elapsed * 1000.0, 3),
"tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
}
def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
a = [
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
for _ in range(pools)
]
b = [
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
for _ in range(pools)
]
torch.cuda.synchronize()
return a, b
def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
device = f"cuda:{args.gpu_index}"
torch.cuda.set_device(args.gpu_index)
scale_a = torch.tensor(1.0, device=device)
scale_b = torch.tensor(1.0, device=device)
pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
results: list[dict[str, Any]] = []
def eager_default(i: int) -> torch.Tensor:
idx = i % args.pools
return torch._scaled_mm(
pools_a[idx],
pools_b[idx].T,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
)
def eager_fast(i: int) -> torch.Tensor:
idx = i % args.pools
return torch._scaled_mm(
pools_a[idx],
pools_b[idx].T,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True,
)
results.append(
cuda_event_bench(
"A_eager_scaled_mm_default",
args.matrix_size,
args.iterations,
args.warmup,
eager_default,
)
)
results.append(
cuda_event_bench(
"B_eager_scaled_mm_fast_accum",
args.matrix_size,
args.iterations,
args.warmup,
eager_fast,
)
)
graph_out = torch.empty(
(args.matrix_size, args.matrix_size),
device=device,
dtype=torch.bfloat16,
)
static_a = pools_a[0]
static_b_t = pools_b[0].T
try:
side_stream = torch.cuda.Stream()
side_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side_stream):
for _ in range(max(3, args.warmup // 2)):
torch._scaled_mm(
static_a,
static_b_t,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True,
out=graph_out,
)
torch.cuda.current_stream().wait_stream(side_stream)
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
torch._scaled_mm(
static_a,
static_b_t,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True,
out=graph_out,
)
def graph_replay(_: int) -> None:
graph.replay()
results.append(
cuda_event_bench(
"C_cuda_graph_scaled_mm_fast_accum",
args.matrix_size,
args.iterations,
3,
graph_replay,
)
)
except Exception as exc: # noqa: BLE001
results.append(
{
"name": "C_cuda_graph_scaled_mm_fast_accum",
"status": "unavailable",
"reason": f"{type(exc).__name__}: {exc}",
}
)
return results
def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
try:
import transformer_engine.pytorch as te # type: ignore[import-not-found]
from transformer_engine.common.recipe import DelayedScaling, Format # type: ignore[import-not-found]
except Exception as exc: # noqa: BLE001
return {
"name": "D_transformer_engine_fp8_linear",
"status": "unavailable",
"reason": f"{type(exc).__name__}: {exc}",
}
device = f"cuda:{args.gpu_index}"
x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
layer = te.Linear(
args.matrix_size,
args.matrix_size,
bias=False,
params_dtype=torch.bfloat16,
device=device,
)
recipe = DelayedScaling(fp8_format=Format.HYBRID)
def run(_: int) -> torch.Tensor:
with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
return layer(x)
try:
result = cuda_event_bench(
"D_transformer_engine_fp8_linear",
args.matrix_size,
args.iterations,
args.warmup,
run,
)
except Exception as exc: # noqa: BLE001
return {
"name": "D_transformer_engine_fp8_linear",
"status": "error",
"reason": f"{type(exc).__name__}: {exc}",
}
result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
return result
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--matrix-size", type=int, default=8192)
parser.add_argument("--warmup", type=int, default=20)
parser.add_argument("--iterations", type=int, default=100)
parser.add_argument("--gpu-index", type=int, default=0)
parser.add_argument("--pools", type=int, default=4)
args = parser.parse_args()
if not torch.cuda.is_available():
print(json.dumps({"error": "cuda unavailable"}, indent=2))
return 1
if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
return 1
torch.cuda.set_device(args.gpu_index)
props = torch.cuda.get_device_properties(args.gpu_index)
payload = {
"source": "pytorch_fp8_path_bench",
"torch": torch.__version__,
"cuda": torch.version.cuda,
"gpu_index": args.gpu_index,
"gpu_name": props.name,
"matrix_size": args.matrix_size,
"warmup": args.warmup,
"iterations": args.iterations,
"results": [],
}
try:
payload["results"].extend(bench_scaled_mm(args))
payload["results"].append(bench_transformer_engine(args))
except torch.cuda.OutOfMemoryError as exc:
payload["error"] = f"CUDA OOM: {exc}"
print(json.dumps(payload, indent=2))
return 1
ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
if ok_values:
payload["summary"] = {
"max_tflops": round(max(ok_values), 1),
"min_tflops": round(min(ok_values), 1),
"mean_tflops": round(statistics.mean(ok_values), 1),
}
print(json.dumps(payload, indent=2))
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@ -1,45 +0,0 @@
#!/usr/bin/env bash
set -uo pipefail
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-200}"
GPU_COUNT="${GPU_COUNT:-8}"
FIRST_GPU="${FIRST_GPU:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"
if [[ ! -x "$NVCC" ]]; then
echo "nvcc not found: $NVCC" >&2
exit 1
fi
mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
-lcublasLt -lcublas -o "$BIN"
set +e
"$BIN" \
--matrix-size "$MATRIX_SIZE" \
--warmup "$WARMUP" \
--iterations "$ITERATIONS" \
--first-gpu "$FIRST_GPU" \
--gpu-count "$GPU_COUNT" \
--workspace-mb "$WORKSPACE_MB" \
| tee "$REPORT"
status=${PIPESTATUS[0]}
set -e
echo "Report written to: $REPORT"
exit "$status"

View File

@ -1,93 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
import site
print(site.getsitepackages()[0])
PY
)"
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"
mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
--matrix-size "$MATRIX_SIZE" \
--warmup "$WARMUP" \
--iterations "$ITERATIONS" \
--gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
-lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
--matrix-size "$MATRIX_SIZE" \
--warmup "$WARMUP" \
--iterations "$ITERATIONS" \
--first-gpu "$GPU_INDEX" \
--gpu-count 1 \
--workspace-mb "$WORKSPACE_MB" \
--fast-accum 1 | tee "$CUBLAS_REPORT"
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys
py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])
with py_report.open() as f:
py_payload = json.load(f)
with cublas_report.open() as f:
cublas_payload = json.load(f)
combined = {
"source": "fp8_path_comparison",
"host": cublas_payload.get("host"),
"matrix_size": py_payload.get("matrix_size"),
"gpu_index": py_payload.get("gpu_index"),
"pytorch": py_payload,
"cublaslt": cublas_payload,
"results": [],
}
combined["results"].extend(py_payload.get("results", []))
per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
row = dict(per_gpu[0])
row.update({
"name": "E_direct_cublaslt_fast_accum",
"status": "ok",
"tflops": row.pop("fp8_tflops"),
"matrix_size": cublas_payload.get("matrix_size"),
"iterations": cublas_payload.get("iterations"),
"warmup": cublas_payload.get("warmup"),
"fast_accum": cublas_payload.get("fast_accum"),
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
})
combined["results"].append(row)
combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY
echo "$COMBINED_REPORT"