Compare commits
37 Commits
4dddab27b3
...
211140e7f1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
211140e7f1 | ||
|
|
9b0e6e29df | ||
|
|
cfca4df62a | ||
|
|
6d72c67468 | ||
|
|
64f203a9be | ||
|
|
c48f9f915d | ||
|
|
0a772103f0 | ||
|
|
f5699bf85a | ||
|
|
59621d26a0 | ||
|
|
18cebd8e06 | ||
|
|
1a8cf6cbbb | ||
|
|
63c32fd75d | ||
|
|
4f38b3a2a0 | ||
|
|
8ff5021385 | ||
|
|
bdffd7e616 | ||
|
|
b6b1ccc2dc | ||
|
|
ec6b868d3f | ||
|
|
3a0e739991 | ||
|
|
b914cb7b4b | ||
|
|
adcdb36e05 | ||
|
|
0d63ea5e05 | ||
|
|
08e0d93a16 | ||
|
|
9a32645c9d | ||
|
|
71ac97a24e | ||
|
|
890e623be4 | ||
|
|
17441a4583 | ||
|
|
05294a66d8 | ||
|
|
6b302c8b09 | ||
|
|
cd1728a7c3 | ||
|
|
48a27472e2 | ||
|
|
49d358c0ca | ||
|
|
519de86553 | ||
|
|
3268f4bd2a | ||
|
|
b82ac54218 | ||
|
|
28fe55d5c7 | ||
|
|
ac91f1aeb5 | ||
|
|
2a51be1ba3 |
@ -1,4 +1,4 @@
|
||||
# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
|
||||
# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
|
||||
gpu_type: auto
|
||||
|
||||
benchmark:
|
||||
@ -14,25 +14,12 @@ benchmark:
|
||||
- fp16
|
||||
- bf16
|
||||
- fp8
|
||||
# MAMF-style shape sweep: measure each dtype at every shape below and keep the max
|
||||
# TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
|
||||
# ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
|
||||
# Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
|
||||
# (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
|
||||
# Empty list => single matrix_size shape (legacy behaviour).
|
||||
sweep_sizes:
|
||||
- 3584
|
||||
- 4608
|
||||
- 5376
|
||||
- 8192
|
||||
- 11520
|
||||
- [2048, 2048, 13312]
|
||||
- [2048, 2048, 16384]
|
||||
matrix_size: 8192 # fallback shape when sweep_sizes is empty
|
||||
warmup: 20
|
||||
iterations: 80
|
||||
# NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
|
||||
# GEMM, and compiling would re-autotune per shape and make the sweep very slow.
|
||||
- fp64
|
||||
- int8
|
||||
matrix_size: 8192
|
||||
warmup: 50
|
||||
iterations: 500
|
||||
use_compile: true
|
||||
|
||||
health:
|
||||
temp_warning: 75
|
||||
@ -44,9 +31,15 @@ nccl:
|
||||
test_allreduce: true
|
||||
test_alltoall: true
|
||||
test_broadcast: true
|
||||
test_reduce_scatter: false
|
||||
test_allgather: false
|
||||
test_sendrecv: false
|
||||
test_reduce_scatter: true
|
||||
test_allgather: true
|
||||
test_sendrecv: true
|
||||
message_sizes:
|
||||
- 1M
|
||||
- 256M
|
||||
- 2G
|
||||
repeats: 3
|
||||
max_stddev_pct: 3
|
||||
|
||||
multinode_nccl:
|
||||
enabled: false
|
||||
@ -98,44 +91,66 @@ multinode_nccl:
|
||||
alltoall: 75
|
||||
|
||||
stress:
|
||||
duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
|
||||
duration_sec: 1800
|
||||
production_duration_sec: 1800
|
||||
use_gpu_burn: false
|
||||
use_doubles: false
|
||||
use_tensor_cores: true
|
||||
memory_pct: 90
|
||||
gpus: all
|
||||
dtype: bf16
|
||||
matrix_size: 24576
|
||||
telemetry_interval_sec: 1
|
||||
warmup_sec: 60
|
||||
min_steady_samples: 10
|
||||
max_temp_c: 80
|
||||
max_temp_delta_c: 5
|
||||
min_power_watts: 630
|
||||
max_tflops_jitter_pct: 5
|
||||
require_tflops_jitter: true
|
||||
|
||||
rdma:
|
||||
min_bandwidth_gbps: 50
|
||||
max_latency_us: 10
|
||||
min_bandwidth_gbps: 47
|
||||
min_port_rate_gbps: 400
|
||||
max_latency_us: 3.5
|
||||
max_write_latency_us: 2.0
|
||||
max_read_latency_us: 3.5
|
||||
ib_iterations: 1000
|
||||
msg_size: 65536
|
||||
msg_size: 4194304
|
||||
latency_msg_size: 8
|
||||
ib_device: null
|
||||
ib_port: 1
|
||||
# Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
|
||||
# node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
|
||||
# ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
|
||||
# then drives the local client per device.
|
||||
cross_node:
|
||||
enabled: false # set true on the client node to run cross-node RDMA
|
||||
server: null # peer ssh address, e.g. 172.72.8.12 (server node)
|
||||
server_addr: null # OOB addr client connects to (default: = server)
|
||||
ssh_user: root
|
||||
devices: [] # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
|
||||
ib_port: 1
|
||||
gid_index: null # -x <n> for RoCE; null for pure InfiniBand
|
||||
msg_size: 1048576 # 1 MiB — large enough to reach NDR400 peak
|
||||
iters: 5000
|
||||
base_oob_port: 18515 # per-device OOB port = base + device index
|
||||
server_warmup_sec: 2.0
|
||||
min_bandwidth_gbps: 350 # per-port PASS floor (NDR400 ≈ 0.9 × 400)
|
||||
max_latency_us: 5
|
||||
server_addr: null
|
||||
ibping_target: null
|
||||
ibping_count: 5
|
||||
role: auto
|
||||
pfc_ecn_counters: true
|
||||
|
||||
nvlink:
|
||||
expected_links_per_gpu: 18
|
||||
expected_link_speed_gbps: 25
|
||||
require_zero_errors: true
|
||||
|
||||
dcgm:
|
||||
diag_level: 3
|
||||
timeout_sec: 3600
|
||||
expected_num_gpus: 8
|
||||
json_output: true
|
||||
require_subtests: true
|
||||
|
||||
training:
|
||||
model: gpt2
|
||||
model: synthetic_1.5b
|
||||
batch_size: 8
|
||||
seq_length: 2048
|
||||
num_steps: 50
|
||||
warmup_steps: 5
|
||||
dtype: bf16
|
||||
mode: ddp
|
||||
synthetic_params_b: 1.5
|
||||
min_tokens_per_sec: 45000
|
||||
max_step_jitter_pct: 3
|
||||
max_peak_memory_gb: 70
|
||||
require_distributed: true
|
||||
|
||||
report:
|
||||
output_dir: ./reports
|
||||
|
||||
@ -312,31 +312,10 @@ class Benchmark:
|
||||
def run_compute_benchmark(self, dtypes: Optional[List[str]] = None) -> dict:
|
||||
comp_cfg = self.bench_cfg.get("compute", {})
|
||||
configured_dtypes = dtypes or comp_cfg.get("dtypes", ["fp32", "tf32", "fp16", "bf16", "fp8"])
|
||||
|
||||
# MAMF-style shape sweep (à la stas00's mamf-finder): a single fixed matmul
|
||||
# shape under-reports the achievable peak by ~7-12% and therefore can't meet
|
||||
# the MAMF-calibrated PASS thresholds in gpu_specs.compute_pass_thresholds_tflops.
|
||||
# So for each dtype we time several matmul shapes and keep the MAXIMUM TFLOPS
|
||||
# (the realistic peak). matrix_size is the fallback when sweep_sizes is empty.
|
||||
matrix_size = comp_cfg.get("matrix_size", 8192)
|
||||
sweep_sizes = comp_cfg.get("sweep_sizes") or [matrix_size]
|
||||
warmup = comp_cfg.get("warmup", 20)
|
||||
iterations = comp_cfg.get("iterations", 80)
|
||||
|
||||
# Each sweep entry is either an int N (square N×N×N) or an [M, N, K] triple.
|
||||
# Non-square / K-heavy shapes (e.g. 2048×2048×13312) reach the true MAMF peak
|
||||
# on Hopper — square-only tops out ~5% lower — so the default set mixes both.
|
||||
def _to_shape(entry):
|
||||
if isinstance(entry, (list, tuple)):
|
||||
if len(entry) == 3:
|
||||
return tuple(int(x) for x in entry)
|
||||
if len(entry) == 1:
|
||||
n = int(entry[0])
|
||||
return (n, n, n)
|
||||
raise ValueError(f"sweep size {entry!r} must be an int or [M, N, K]")
|
||||
n = int(entry)
|
||||
return (n, n, n)
|
||||
shapes = [_to_shape(e) for e in sweep_sizes]
|
||||
matrix_size = comp_cfg.get("matrix_size", 4096)
|
||||
warmup = comp_cfg.get("warmup", 10)
|
||||
iterations = comp_cfg.get("iterations", 100)
|
||||
use_compile = comp_cfg.get("use_compile", False)
|
||||
|
||||
if not TORCH_AVAILABLE:
|
||||
self.console.print("[yellow]PyTorch not available - skipping compute benchmark[/yellow]")
|
||||
@ -344,22 +323,37 @@ class Benchmark:
|
||||
|
||||
gpu_count = torch.cuda.device_count()
|
||||
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
|
||||
if len(sweep_sizes) > 1:
|
||||
self.console.print(
|
||||
f"[cyan] MAMF shape sweep over {len(sweep_sizes)} sizes: "
|
||||
f"{', '.join(str(s) for s in sweep_sizes)}[/cyan]"
|
||||
)
|
||||
|
||||
# torch.compile(max-autotune) benchmarks cuBLAS vs Triton kernels and picks
|
||||
# the fastest for this GPU/shape, typically improving efficiency by 8-15%.
|
||||
# compile_warmup must be larger than warmup to absorb JIT + autotuning time.
|
||||
mm_fn = torch.matmul
|
||||
compile_warmup = warmup
|
||||
if use_compile:
|
||||
try:
|
||||
_compiled = torch.compile(torch.matmul, mode="max-autotune")
|
||||
# Trial call to trigger JIT and verify compilation succeeds before the dtype loop.
|
||||
_t = torch.randn(64, 64, device="cuda", dtype=torch.float32)
|
||||
_compiled(_t, _t)
|
||||
torch.cuda.synchronize()
|
||||
del _t
|
||||
mm_fn = _compiled
|
||||
compile_warmup = max(warmup, 50)
|
||||
self.console.print("[cyan] torch.compile(max-autotune) enabled[/cyan]")
|
||||
except Exception as e:
|
||||
self.console.print(f"[yellow] torch.compile unavailable ({type(e).__name__}), using eager[/yellow]")
|
||||
|
||||
dtype_map = {
|
||||
"fp32": (torch.float32, self.specs["fp32_tflops"]),
|
||||
"tf32": ("tf32", self.specs["tf32_tflops"]),
|
||||
"fp16": (torch.float16, self.specs["fp16_tflops"]),
|
||||
"bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
|
||||
"fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
|
||||
"fp32": (torch.float32, self.specs.get("fp32_tflops", 0)),
|
||||
"tf32": ("tf32", self.specs.get("tf32_tflops", 0)),
|
||||
"fp16": (torch.float16, self.specs.get("fp16_tflops", 0)),
|
||||
"bf16": (torch.bfloat16, self.specs.get("bf16_tflops", 0)),
|
||||
"fp8": (getattr(torch, "float8_e4m3fn", None), self.specs.get("fp8_tflops", 0)),
|
||||
"fp64": (torch.float64, self.specs.get("fp64_tflops", 0)),
|
||||
"int8": (torch.int8, self.specs.get("int8_tflops", 0)),
|
||||
}
|
||||
|
||||
results_by_dtype = {}
|
||||
best_shapes = {}
|
||||
per_gpu_results = [{"index": i} for i in range(gpu_count)]
|
||||
|
||||
with Progress(
|
||||
@ -382,41 +376,27 @@ class Benchmark:
|
||||
progress.advance(task)
|
||||
continue
|
||||
|
||||
dtype_val, peak_tflops = dtype_map[dtype_name]
|
||||
|
||||
# allow_tf32 only affects float32 matmuls: ON for the TF32 run, OFF for
|
||||
# the true-FP32 run so the two stay distinct.
|
||||
old_tf32 = torch.backends.cuda.matmul.allow_tf32
|
||||
if dtype_name == "tf32":
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
dtype_val = torch.float32
|
||||
elif dtype_name == "fp32":
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
|
||||
best_tflops, best_shape, last_err = 0.0, None, None
|
||||
for (M, N, K) in shapes:
|
||||
gpu_values = []
|
||||
errors = []
|
||||
for gpu_idx in range(gpu_count):
|
||||
try:
|
||||
t = self._bench_matmul_once(dtype_name, dtype_val, M, N, K, warmup, iterations)
|
||||
if t > best_tflops:
|
||||
best_tflops, best_shape = t, (M, N, K)
|
||||
except Exception as e: # noqa: BLE001 - record and try the next shape
|
||||
last_err = e
|
||||
|
||||
torch.backends.cuda.matmul.allow_tf32 = old_tf32
|
||||
|
||||
if best_shape is None:
|
||||
results_by_dtype[dtype_name] = f"error: {last_err}"
|
||||
self.console.print(f"[yellow] {dtype_name}: {last_err}[/yellow]")
|
||||
else:
|
||||
shape_str = "x".join(str(d) for d in best_shape)
|
||||
results_by_dtype[dtype_name] = round(best_tflops, 1)
|
||||
best_shapes[dtype_name] = shape_str
|
||||
for pg in per_gpu_results:
|
||||
pg[dtype_name] = round(best_tflops, 1)
|
||||
if len(shapes) > 1:
|
||||
self.console.print(
|
||||
f"[dim] {dtype_name}: {best_tflops:.1f} TFLOPS @ {shape_str}[/dim]"
|
||||
val = self._benchmark_dtype_on_gpu(
|
||||
dtype_name, dtype_map[dtype_name][0], matrix_size,
|
||||
warmup, compile_warmup, iterations, mm_fn, gpu_idx,
|
||||
)
|
||||
gpu_values.append(val)
|
||||
per_gpu_results[gpu_idx][dtype_name] = round(val, 1)
|
||||
except Exception as e:
|
||||
errors.append(f"gpu{gpu_idx}: {e}")
|
||||
per_gpu_results[gpu_idx][dtype_name] = f"error: {e}"
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if gpu_values:
|
||||
results_by_dtype[dtype_name] = round(sum(gpu_values) / len(gpu_values), 1)
|
||||
else:
|
||||
results_by_dtype[dtype_name] = "error: " + "; ".join(errors[:3])
|
||||
self.console.print(f"[yellow] {dtype_name}: {results_by_dtype[dtype_name]}[/yellow]")
|
||||
|
||||
progress.advance(task)
|
||||
|
||||
@ -427,78 +407,119 @@ class Benchmark:
|
||||
if peak_tp:
|
||||
efficiency[dt] = round((achieved / peak_tp) * 100, 1)
|
||||
|
||||
consistency = {}
|
||||
for dt in results_by_dtype:
|
||||
vals = [pg.get(dt) for pg in per_gpu_results]
|
||||
nums = [v for v in vals if isinstance(v, (int, float))]
|
||||
if len(nums) >= 2:
|
||||
mean = sum(nums) / len(nums)
|
||||
spread_pct = ((max(nums) - min(nums)) / mean * 100) if mean else 0
|
||||
consistency[dt] = {
|
||||
"mean_tflops": round(mean, 1),
|
||||
"min_tflops": round(min(nums), 1),
|
||||
"max_tflops": round(max(nums), 1),
|
||||
"spread_pct": round(spread_pct, 2),
|
||||
"max_allowed_pct": 3,
|
||||
"passed": spread_pct <= 3,
|
||||
}
|
||||
|
||||
pass_thresholds = dict(self.specs.get("compute_pass_thresholds_tflops") or {})
|
||||
threshold_passed = True
|
||||
for dt, threshold in pass_thresholds.items():
|
||||
val = results_by_dtype.get(dt)
|
||||
if not isinstance(val, (int, float)) or val < threshold:
|
||||
threshold_passed = False
|
||||
break
|
||||
consistency_passed = all(row.get("passed", False) for row in consistency.values()) if consistency else True
|
||||
|
||||
return {
|
||||
"compute": {
|
||||
"passed": threshold_passed and consistency_passed,
|
||||
"per_dtype_tflops": results_by_dtype,
|
||||
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
||||
"efficiency_pct": efficiency,
|
||||
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
|
||||
# report.py judges PASS/WARN/FAIL against these directly instead of
|
||||
# using % of peak. Empty dict => fall back to legacy 80% rule.
|
||||
"pass_thresholds_tflops": dict(
|
||||
self.specs.get("compute_pass_thresholds_tflops") or {}
|
||||
),
|
||||
"pass_thresholds_tflops": pass_thresholds,
|
||||
"per_gpu": per_gpu_results,
|
||||
"sweep_sizes": list(sweep_sizes),
|
||||
"best_shapes": best_shapes,
|
||||
"consistency": consistency,
|
||||
"matrix_size": matrix_size,
|
||||
"warmup": warmup,
|
||||
"iterations": iterations,
|
||||
}
|
||||
}
|
||||
|
||||
def _bench_matmul_once(self, dtype_name: str, dtype_val, M: int, N: int, K: int,
|
||||
warmup: int, iterations: int) -> float:
|
||||
"""Time one (M×K)·(K×N) matmul for a dtype and return achieved TFLOPS.
|
||||
|
||||
Uses an L2-cache-busting pool of matrix pairs (total > 256 MB) so operands
|
||||
can't be served from L2 across iterations, and CUDA events for timing. FP8
|
||||
goes through torch._scaled_mm (e4m3); all others through torch.matmul — eager
|
||||
cuBLAS, which on H100 beats torch.compile/Triton for plain GEMM and avoids the
|
||||
per-shape recompile cost that would make a sweep pathologically slow.
|
||||
"""
|
||||
elem_bytes = 1 if dtype_name == "fp8" else torch.tensor([], dtype=dtype_val).element_size()
|
||||
pair_bytes = (M * K + K * N) * elem_bytes
|
||||
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes)) # ceil(256MB / pair)
|
||||
|
||||
if dtype_name == "fp8":
|
||||
if not hasattr(torch, "_scaled_mm"):
|
||||
raise RuntimeError("torch._scaled_mm unavailable — upgrade to PyTorch >= 2.1")
|
||||
pools_a = [torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||
pools_b = [torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||
scale_a = torch.tensor(1.0, device="cuda")
|
||||
scale_b = torch.tensor(1.0, device="cuda")
|
||||
def op(i):
|
||||
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
|
||||
else:
|
||||
pools_a = [torch.randn(M, K, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
|
||||
pools_b = [torch.randn(K, N, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
|
||||
def op(i):
|
||||
return torch.matmul(pools_a[i], pools_b[i])
|
||||
|
||||
def _benchmark_dtype_on_gpu(self, dtype_name: str, dtype_val, matrix_size: int,
|
||||
warmup: int, compile_warmup: int, iterations: int,
|
||||
mm_fn, gpu_idx: int) -> float:
|
||||
if dtype_name == "fp8" and dtype_val is None:
|
||||
raise RuntimeError("torch.float8_e4m3fn unavailable")
|
||||
device = f"cuda:{gpu_idx}"
|
||||
old_tf32 = torch.backends.cuda.matmul.allow_tf32
|
||||
try:
|
||||
# Probe once so a broken/unsupported kernel raises before the timed loop.
|
||||
_probe = op(0)
|
||||
torch.cuda.synchronize()
|
||||
del _probe
|
||||
with torch.cuda.device(gpu_idx):
|
||||
if dtype_name == "tf32":
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
dtype_val = torch.float32
|
||||
|
||||
for i in range(warmup):
|
||||
op(i % num_pools)
|
||||
torch.cuda.synchronize()
|
||||
M = N = K = matrix_size
|
||||
if dtype_name == "int8" and M > 4096:
|
||||
# torch._int_mm on 8192 can be extremely memory hungry because the
|
||||
# output is int32. Keep it production-visible, but bounded.
|
||||
M = N = K = 4096
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event.record()
|
||||
for i in range(iterations):
|
||||
op(i % num_pools)
|
||||
end_event.record()
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start_event.elapsed_time(end_event)
|
||||
elem_bytes = 1 if dtype_name in ("fp8", "int8") else torch.tensor([], dtype=dtype_val).element_size()
|
||||
pair_bytes = 2 * M * K * elem_bytes
|
||||
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes))
|
||||
|
||||
if dtype_name == "fp8":
|
||||
if not hasattr(torch, "_scaled_mm"):
|
||||
raise RuntimeError("torch._scaled_mm unavailable")
|
||||
pools_a = [torch.randn(M, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||
pools_b = [torch.randn(N, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||
scale_a = torch.tensor(1.0, device=device)
|
||||
scale_b = torch.tensor(1.0, device=device)
|
||||
|
||||
def run(i):
|
||||
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
|
||||
|
||||
effective_warmup = warmup
|
||||
elif dtype_name == "int8":
|
||||
if not hasattr(torch, "_int_mm"):
|
||||
raise RuntimeError("torch._int_mm unavailable")
|
||||
pools_a = [torch.randint(-128, 127, (M, K), device=device, dtype=torch.int8) for _ in range(num_pools)]
|
||||
pools_b = [torch.randint(-128, 127, (K, N), device=device, dtype=torch.int8) for _ in range(num_pools)]
|
||||
|
||||
def run(i):
|
||||
return torch._int_mm(pools_a[i], pools_b[i])
|
||||
|
||||
effective_warmup = warmup
|
||||
else:
|
||||
pools_a = [torch.randn(M, K, device=device, dtype=dtype_val) for _ in range(num_pools)]
|
||||
pools_b = [torch.randn(K, N, device=device, dtype=dtype_val) for _ in range(num_pools)]
|
||||
|
||||
def run(i):
|
||||
return mm_fn(pools_a[i], pools_b[i])
|
||||
|
||||
effective_warmup = compile_warmup
|
||||
|
||||
for i in range(effective_warmup):
|
||||
run(i % num_pools)
|
||||
torch.cuda.synchronize()
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event.record()
|
||||
for i in range(iterations):
|
||||
c = run(i % num_pools)
|
||||
end_event.record()
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start_event.elapsed_time(end_event)
|
||||
del pools_a, pools_b, c
|
||||
flops = 2 * M * N * K * iterations
|
||||
return flops / (elapsed_ms / 1000) / 1e12
|
||||
finally:
|
||||
del pools_a, pools_b
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return (2 * M * N * K * iterations) / (elapsed_ms / 1000) / 1e12
|
||||
torch.backends.cuda.matmul.allow_tf32 = old_tf32
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Console = None):
|
||||
@ -583,77 +604,24 @@ class Benchmark:
|
||||
f"[{ec}]{ef:.1f}%[/{ec}]")
|
||||
c.print(table)
|
||||
|
||||
@staticmethod
|
||||
def judge_compute(results: dict) -> dict:
|
||||
"""Judge compute results against pass_thresholds_tflops.
|
||||
|
||||
Single source of truth for the PASS/WARN/FAIL rule (same one report.py uses):
|
||||
achieved >= thr -> PASS; >= 0.9*thr -> WARN; else FAIL. A string achieved value
|
||||
(skipped/error) -> SKIP. A dtype without a threshold falls back to efficiency
|
||||
(>=80 PASS / >=50 WARN / else FAIL).
|
||||
|
||||
Returns {"rows": [(dtype, achieved, threshold, status), ...], "verdict": str}.
|
||||
"""
|
||||
comp = results.get("compute", results)
|
||||
per_dtype = comp.get("per_dtype_tflops", {})
|
||||
thresholds = comp.get("pass_thresholds_tflops", {}) or {}
|
||||
eff = comp.get("efficiency_pct", {})
|
||||
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
|
||||
rows, verdict = [], "PASS"
|
||||
for dt, val in per_dtype.items():
|
||||
thr = thresholds.get(dt)
|
||||
if isinstance(val, str):
|
||||
status = "SKIP"
|
||||
elif thr:
|
||||
status = "PASS" if val >= thr else ("WARN" if val >= thr * 0.9 else "FAIL")
|
||||
else:
|
||||
e = eff.get(dt, 0)
|
||||
status = "PASS" if e >= 80 else ("WARN" if e >= 50 else "FAIL")
|
||||
rows.append((dt, val, thr, status))
|
||||
if rank[status] > rank[verdict]:
|
||||
verdict = status
|
||||
return {"rows": rows, "verdict": verdict}
|
||||
|
||||
@staticmethod
|
||||
def print_compute_verdict(results: dict, console: Console = None) -> str:
|
||||
"""Print the PASS/WARN/FAIL table for compute results; return the verdict."""
|
||||
c = console or Console()
|
||||
judged = Benchmark.judge_compute(results)
|
||||
color = {"PASS": "green", "WARN": "yellow", "FAIL": "red", "SKIP": "dim"}
|
||||
c.print("\n[bold cyan]Compute Verdict (vs thresholds)[/bold cyan]")
|
||||
for dt, val, thr, status in judged["rows"]:
|
||||
val_s = f"{val:.1f}" if isinstance(val, (int, float)) else str(val)
|
||||
thr_s = f">= {thr}" if thr else "(efficiency)"
|
||||
c.print(f" {dt.upper():>4}: {val_s:>8} {thr_s:<12} [{color[status]}]{status}[/{color[status]}]")
|
||||
v = judged["verdict"]
|
||||
c.print(f" [bold]VERDICT: [{color[v]}]{v}[/{color[v]}][/bold]")
|
||||
return v
|
||||
|
||||
|
||||
def _run_cli() -> None:
|
||||
"""`python -m modules.benchmark` — run ONLY the compute-throughput benchmark."""
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
repo_root = Path(__file__).resolve().parent.parent
|
||||
parser = argparse.ArgumentParser(description="Run the compute-throughput benchmark only.")
|
||||
parser.add_argument("--config", default=str(repo_root / "configs" / "default.yaml"),
|
||||
help="path to config YAML (default: configs/default.yaml)")
|
||||
parser.add_argument("--json", action="store_true", help="also print raw JSON of the compute results")
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.config) as f:
|
||||
config = yaml.safe_load(f) or {}
|
||||
|
||||
results = Benchmark(config).run_compute_benchmark()
|
||||
Benchmark.print_results(results)
|
||||
Benchmark.print_compute_verdict(results)
|
||||
|
||||
if args.json:
|
||||
print("JSON_RESULT:" + json.dumps(results["compute"]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_run_cli()
|
||||
consistency = comp.get("consistency", {})
|
||||
if consistency:
|
||||
t_cons = Table(title="Per-GPU Consistency", box=None, padding=(0, 1))
|
||||
t_cons.add_column("DType", style="bold")
|
||||
t_cons.add_column("Min", justify="right")
|
||||
t_cons.add_column("Mean", justify="right")
|
||||
t_cons.add_column("Max", justify="right")
|
||||
t_cons.add_column("Spread", justify="right")
|
||||
t_cons.add_column("Status", justify="right")
|
||||
for dt, row in consistency.items():
|
||||
status = "PASS" if row.get("passed") else "FAIL"
|
||||
color = "green" if row.get("passed") else "red"
|
||||
t_cons.add_row(
|
||||
dt.upper(),
|
||||
f"{row.get('min_tflops', 0):.1f}",
|
||||
f"{row.get('mean_tflops', 0):.1f}",
|
||||
f"{row.get('max_tflops', 0):.1f}",
|
||||
f"{row.get('spread_pct', 0):.2f}%",
|
||||
f"[{color}]{status}[/{color}]",
|
||||
)
|
||||
c.print(t_cons)
|
||||
|
||||
@ -11,7 +11,6 @@ GPU_NAME_PATTERNS = {
|
||||
"A100": "a100",
|
||||
"A800": "a800",
|
||||
"H100": "h100",
|
||||
"H800": "h800", # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
|
||||
"H200": "h200",
|
||||
"H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
|
||||
"B200": "b200",
|
||||
@ -36,17 +35,11 @@ GPU_SPECS = {
|
||||
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
||||
"bf16_tflops": 990, # dense
|
||||
"fp8_tflops": 1979, # dense
|
||||
"fp64_tflops": 67,
|
||||
"int8_tflops": 1979,
|
||||
"compute_pass_thresholds_tflops": {
|
||||
# Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
|
||||
# threshold ~2-4% below the sustained value measured across 16 GPUs via the
|
||||
# MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
|
||||
# fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
|
||||
# fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
|
||||
# on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
|
||||
# an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
|
||||
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
|
||||
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
|
||||
# doesn't currently exercise these dtypes.
|
||||
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
|
||||
"fp64": 63, "int8": 1536,
|
||||
},
|
||||
"tdp_watts": 700,
|
||||
"nvlink_gen": 4,
|
||||
@ -67,51 +60,10 @@ GPU_SPECS = {
|
||||
"fp16_tflops": 990, # dense
|
||||
"bf16_tflops": 990, # dense
|
||||
"fp8_tflops": 1979, # dense
|
||||
# PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
|
||||
# calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
|
||||
# achievable TFLOPS in PyTorch is in the same band.
|
||||
"compute_pass_thresholds_tflops": {
|
||||
"fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
|
||||
},
|
||||
"tdp_watts": 700,
|
||||
"nvlink_gen": 4,
|
||||
"nvlink_bandwidth_gbps": 900,
|
||||
"pcie_gen": 5,
|
||||
"min_driver_version": "545",
|
||||
"min_cuda_version": "12.4",
|
||||
},
|
||||
"h800": {
|
||||
# H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
|
||||
# clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
|
||||
# FP32) are identical to H100. Two restrictions vs H100:
|
||||
# 1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
|
||||
# 2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
|
||||
# All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
|
||||
# NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
|
||||
"full_name": "NVIDIA H800 SXM5",
|
||||
"architecture": "Hopper",
|
||||
"compute_capability": 9.0,
|
||||
"hbm_capacity_gb": 80,
|
||||
"hbm_type": "HBM3",
|
||||
"memory_bandwidth_gbps": 3350, # GB/s (3.35 TB/s) — same as H100 SXM
|
||||
"fp32_tflops": 67,
|
||||
"tf32_tflops": 495, # dense (same as H100)
|
||||
"fp16_tflops": 990, # dense (same as H100)
|
||||
"bf16_tflops": 990, # dense (same as H100)
|
||||
"fp8_tflops": 1979, # dense (same as H100)
|
||||
# Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
|
||||
# eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
|
||||
# tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
|
||||
# 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
|
||||
# can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
|
||||
# h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
|
||||
"compute_pass_thresholds_tflops": {
|
||||
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
|
||||
},
|
||||
"tdp_watts": 700,
|
||||
"nvlink_gen": 4,
|
||||
"nvlink_bandwidth_gbps": 400, # bidirectional — HALF of H100 (export restriction)
|
||||
"pcie_gen": 5,
|
||||
"min_driver_version": "535",
|
||||
"min_cuda_version": "12.1",
|
||||
},
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
"""RDMA / InfiniBand bandwidth and latency test module."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from modules.gpu_specs import resolve_tools_dir
|
||||
|
||||
|
||||
class RDMATest:
|
||||
|
||||
@ -17,11 +19,24 @@ class RDMATest:
|
||||
self.config = config
|
||||
self.console = Console()
|
||||
self.rdma_cfg = config.get("rdma", {})
|
||||
self.tools_dir = resolve_tools_dir(config)
|
||||
|
||||
def _find_tool(self, name: str) -> Optional[str]:
|
||||
p = shutil.which(name)
|
||||
if p:
|
||||
return p
|
||||
candidates = [
|
||||
os.path.join(self.tools_dir, "perftest", name),
|
||||
os.path.join(self.tools_dir, "perftest", "bin", name),
|
||||
os.path.join(self.tools_dir, "rdma", name),
|
||||
os.path.join(self.tools_dir, name),
|
||||
]
|
||||
for path in candidates:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
for path in glob.glob(os.path.join(self.tools_dir, "**", name), recursive=True):
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
return None
|
||||
|
||||
def _get_ib_devices(self) -> List[str]:
|
||||
@ -101,26 +116,40 @@ class RDMATest:
|
||||
|
||||
self.console.print(f"[cyan]RDMA Test - Devices: {', '.join(devices)}[/cyan]")
|
||||
|
||||
bw_results = self._run_bandwidth_tests(devices)
|
||||
latency_results = self._run_latency_tests(devices)
|
||||
active_pairs = [
|
||||
(dev, port) for dev, port in ib_devices
|
||||
if "ACTIVE" in self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state").upper()
|
||||
]
|
||||
port_checks = self._evaluate_port_checks(device_info)
|
||||
test_devices = [dev for dev, _ in active_pairs]
|
||||
bw_results = self._run_bandwidth_tests(test_devices)
|
||||
latency_results = self._run_latency_tests(test_devices)
|
||||
ibping_results = self._run_ibping_tests(active_pairs)
|
||||
fabric_counters = self._collect_pfc_ecn_counters() if self.rdma_cfg.get("pfc_ecn_counters", True) else {}
|
||||
failures = self._failure_reasons(port_checks, bw_results, latency_results, ibping_results, fabric_counters)
|
||||
|
||||
fabric_counters_missing = (
|
||||
self.rdma_cfg.get("pfc_ecn_counters", True)
|
||||
and fabric_counters
|
||||
and not fabric_counters.get("counters")
|
||||
)
|
||||
all_passed = all(
|
||||
r.get("status") == "PASS"
|
||||
for r in bw_results + latency_results
|
||||
for r in bw_results + latency_results + ibping_results
|
||||
if isinstance(r, dict)
|
||||
)
|
||||
) and all(p.get("status") == "PASS" for p in port_checks) and not fabric_counters.get("failed", False) and not fabric_counters_missing
|
||||
|
||||
result = {
|
||||
return {
|
||||
"passed": all_passed,
|
||||
"devices": device_info,
|
||||
"port_checks": port_checks,
|
||||
"bandwidth_tests": bw_results,
|
||||
"latency_tests": latency_results,
|
||||
"ibping_tests": ibping_results,
|
||||
"fabric_counters": fabric_counters,
|
||||
"failures": failures,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
# Cross-node (two-host) RDMA, run only when a peer is configured.
|
||||
if (self.rdma_cfg.get("cross_node", {}) or {}).get("enabled"):
|
||||
result["cross_node"] = self.run_cross_node()
|
||||
return result
|
||||
|
||||
def _collect_device_info(self, devices: List[str]) -> List[dict]:
|
||||
info = []
|
||||
@ -141,11 +170,83 @@ class RDMATest:
|
||||
port_info[label] = f.read().strip()
|
||||
except (FileNotFoundError, PermissionError):
|
||||
port_info[label] = "N/A"
|
||||
port_info["link_layer"] = self._read_sys(
|
||||
f"/sys/class/infiniband/{dev}/ports/{port}/link_layer"
|
||||
) or "N/A"
|
||||
|
||||
dev_info["ports"].append(port_info)
|
||||
info.append(dev_info)
|
||||
return info
|
||||
|
||||
def _evaluate_port_checks(self, device_info: List[dict]) -> List[dict]:
|
||||
checks = []
|
||||
min_rate = float(self.rdma_cfg.get("min_port_rate_gbps", 400))
|
||||
for dev in device_info:
|
||||
for port in dev.get("ports", []):
|
||||
if port.get("link_layer") != "InfiniBand":
|
||||
continue
|
||||
state = port.get("state", "")
|
||||
rate = port.get("rate", "")
|
||||
rate_gbps = self._parse_rate_gbps(rate)
|
||||
status = "PASS" if "ACTIVE" in state.upper() and rate_gbps >= min_rate else "FAIL"
|
||||
checks.append({
|
||||
"device": dev.get("name"),
|
||||
"port": port.get("port"),
|
||||
"state": state,
|
||||
"rate": rate,
|
||||
"rate_gbps": rate_gbps,
|
||||
"min_rate_gbps": min_rate,
|
||||
"status": status,
|
||||
})
|
||||
return checks
|
||||
|
||||
@staticmethod
|
||||
def _parse_rate_gbps(rate: str) -> float:
|
||||
# Example: "400 Gb/sec (4X NDR)"
|
||||
try:
|
||||
return float(str(rate).split()[0])
|
||||
except (ValueError, IndexError, AttributeError):
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def _failure_reasons(port_checks: List[dict], bw_results: List[dict],
|
||||
latency_results: List[dict], ibping_results: List[dict],
|
||||
fabric_counters: dict) -> List[str]:
|
||||
failures = []
|
||||
for p in port_checks:
|
||||
if p.get("status") != "PASS":
|
||||
failures.append(
|
||||
f"{p.get('device')} port {p.get('port')} state/rate failed "
|
||||
f"({p.get('state')}, {p.get('rate')}; required >= {p.get('min_rate_gbps')}Gbps ACTIVE)"
|
||||
)
|
||||
for r in bw_results:
|
||||
if r.get("status") != "PASS":
|
||||
if r.get("error"):
|
||||
failures.append(f"{r.get('test')} failed: {r.get('error')}")
|
||||
else:
|
||||
failures.append(
|
||||
f"{r.get('test')} bandwidth {r.get('bandwidth_gbps', 0)}GB/s "
|
||||
f"< {r.get('min_required_gbps', 'N/A')}GB/s"
|
||||
)
|
||||
for r in latency_results:
|
||||
if r.get("status") != "PASS":
|
||||
if r.get("error"):
|
||||
failures.append(f"{r.get('test')} failed: {r.get('error')}")
|
||||
else:
|
||||
failures.append(
|
||||
f"{r.get('test')} latency {r.get('latency_us', 0)}us "
|
||||
f"> {r.get('max_allowed_us', 'N/A')}us"
|
||||
)
|
||||
for r in ibping_results:
|
||||
if r.get("status") != "PASS":
|
||||
failures.append(f"{r.get('test')} failed: {r.get('error') or r.get('output_tail', '')[:120]}")
|
||||
if fabric_counters.get("failed"):
|
||||
nonzero = [f"{k}={v}" for k, v in fabric_counters.get("counters", {}).items() if v]
|
||||
failures.append("non-zero PFC/ECN/CNP/congestion counters: " + ", ".join(nonzero[:10]))
|
||||
elif fabric_counters and not fabric_counters.get("counters"):
|
||||
failures.append("PFC/ECN/CNP/congestion counters not found; fabric counter evidence missing")
|
||||
return failures
|
||||
|
||||
def _run_ib_command(self, cmd: List[str], timeout: int = 60) -> dict:
|
||||
try:
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
||||
@ -168,45 +269,70 @@ class RDMATest:
|
||||
iters = self.rdma_cfg.get("ib_iterations", 1000)
|
||||
dx = self.rdma_cfg.get("ib_device", None)
|
||||
port = self.rdma_cfg.get("ib_port", 1)
|
||||
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||
role = self.rdma_cfg.get("role", "auto")
|
||||
|
||||
for tool, label in [(ib_write_bw, "ib_write_bw"), (ib_read_bw, "ib_read_bw")]:
|
||||
if not tool:
|
||||
results.append({"test": label, "status": "SKIP", "error": "not installed"})
|
||||
results.append({"test": label, "status": "FAIL", "error": "not installed"})
|
||||
continue
|
||||
if role == "client" and not server_addr:
|
||||
results.append({
|
||||
"test": label,
|
||||
"status": "FAIL",
|
||||
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
|
||||
"role": "client",
|
||||
})
|
||||
continue
|
||||
|
||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size)]
|
||||
client_cmd = server_cmd + ["localhost"]
|
||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
|
||||
client_cmd = server_cmd + [server_addr or "localhost"]
|
||||
|
||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
import time
|
||||
time.sleep(1)
|
||||
if role == "server":
|
||||
results.append(self._run_server_mode(label, server_cmd))
|
||||
continue
|
||||
|
||||
server = None
|
||||
if not server_addr and role != "client":
|
||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
import time
|
||||
time.sleep(1)
|
||||
|
||||
try:
|
||||
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
||||
server.wait(timeout=10)
|
||||
if server:
|
||||
server.wait(timeout=10)
|
||||
|
||||
output = client.stdout + server.stdout.read() if server.stdout else ""
|
||||
bw_mbps = 0
|
||||
output = client.stdout
|
||||
if server and server.stdout:
|
||||
output += server.stdout.read()
|
||||
bw_mibps = 0
|
||||
for line in output.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
parts = line.split()
|
||||
try:
|
||||
bw_mbps = max(bw_mbps, float(parts[-1]))
|
||||
if len(parts) >= 5 and int(parts[0]) == int(msg_size):
|
||||
# perftest bandwidth rows:
|
||||
# #bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
|
||||
bw_mibps = max(bw_mibps, float(parts[3]))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
bw_gbps = bw_mbps / 1000 if bw_mbps else 0
|
||||
status = "PASS" if bw_gbps >= min_bw else "WARN"
|
||||
bw_gbps = bw_mibps * 1024 * 1024 / 1e9 if bw_mibps else 0
|
||||
status = "PASS" if bw_gbps >= min_bw else "FAIL"
|
||||
results.append({
|
||||
"test": label,
|
||||
"status": status,
|
||||
"bandwidth_gbps": round(bw_gbps, 2),
|
||||
"min_required_gbps": min_bw,
|
||||
"msg_size": msg_size,
|
||||
"role": "client" if server_addr else "local_loopback",
|
||||
})
|
||||
except Exception as e:
|
||||
server.kill()
|
||||
if server:
|
||||
server.kill()
|
||||
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
||||
|
||||
return results
|
||||
@ -216,240 +342,214 @@ class RDMATest:
|
||||
ib_write_lat = self._find_tool("ib_write_lat")
|
||||
ib_read_lat = self._find_tool("ib_read_lat")
|
||||
max_lat_us = self.rdma_cfg.get("max_latency_us", 10)
|
||||
max_by_test = {
|
||||
"ib_write_lat": self.rdma_cfg.get("max_write_latency_us", max_lat_us),
|
||||
"ib_read_lat": self.rdma_cfg.get("max_read_latency_us", max_lat_us),
|
||||
}
|
||||
dx = self.rdma_cfg.get("ib_device", None)
|
||||
port = self.rdma_cfg.get("ib_port", 1)
|
||||
msg_size = self.rdma_cfg.get("latency_msg_size", 8)
|
||||
iters = self.rdma_cfg.get("ib_iterations", 1000)
|
||||
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||
role = self.rdma_cfg.get("role", "auto")
|
||||
|
||||
for tool, label in [(ib_write_lat, "ib_write_lat"), (ib_read_lat, "ib_read_lat")]:
|
||||
if not tool:
|
||||
results.append({"test": label, "status": "SKIP", "error": "not installed"})
|
||||
results.append({"test": label, "status": "FAIL", "error": "not installed"})
|
||||
continue
|
||||
if role == "client" and not server_addr:
|
||||
results.append({
|
||||
"test": label,
|
||||
"status": "FAIL",
|
||||
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
|
||||
"role": "client",
|
||||
})
|
||||
continue
|
||||
|
||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port)]
|
||||
client_cmd = server_cmd + ["localhost"]
|
||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
|
||||
client_cmd = server_cmd + [server_addr or "localhost"]
|
||||
|
||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
import time
|
||||
time.sleep(1)
|
||||
if role == "server":
|
||||
results.append(self._run_server_mode(label, server_cmd))
|
||||
continue
|
||||
|
||||
server = None
|
||||
if not server_addr and role != "client":
|
||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
import time
|
||||
time.sleep(1)
|
||||
|
||||
try:
|
||||
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
||||
server.wait(timeout=10)
|
||||
if server:
|
||||
server.wait(timeout=10)
|
||||
|
||||
output = client.stdout + server.stdout.read() if server.stdout else ""
|
||||
output = client.stdout
|
||||
if server and server.stdout:
|
||||
output += server.stdout.read()
|
||||
lat_us = 0
|
||||
for line in output.split("\n"):
|
||||
parts = line.strip().split()
|
||||
try:
|
||||
lat_us = max(lat_us, float(parts[-1]))
|
||||
if len(parts) >= 6:
|
||||
int(parts[0])
|
||||
int(parts[1])
|
||||
# perftest latency rows:
|
||||
# #bytes #iterations t_min t_max t_typical t_avg t_stdev p99 p99.9
|
||||
lat_us = max(lat_us, float(parts[5]))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
status = "PASS" if 0 < lat_us <= max_lat_us else ("WARN" if lat_us > 0 else "FAIL")
|
||||
max_allowed = max_by_test[label]
|
||||
status = "PASS" if 0 < lat_us <= max_allowed else "FAIL"
|
||||
results.append({
|
||||
"test": label,
|
||||
"status": status,
|
||||
"latency_us": round(lat_us, 2),
|
||||
"max_allowed_us": max_lat_us,
|
||||
"max_allowed_us": max_allowed,
|
||||
"msg_size": msg_size,
|
||||
"role": "client" if server_addr else "local_loopback",
|
||||
})
|
||||
except Exception as e:
|
||||
server.kill()
|
||||
if server:
|
||||
server.kill()
|
||||
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
||||
|
||||
return results
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cross-node (two-host) RDMA over perftest, orchestrated via SSH.
|
||||
# Runs FROM the client host: for each IB device it launches the matching
|
||||
# perftest server on the peer over SSH (held open in a live ssh channel),
|
||||
# then runs the local client against the peer's OOB address and parses the
|
||||
# result. Replaces the old standalone scripts/rdma_cross_node.sh.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _active_ib_devices(self) -> List[str]:
|
||||
"""IB devices whose port 1 is InfiniBand link_layer and ACTIVE."""
|
||||
out = []
|
||||
for dev in self._get_ib_devices():
|
||||
for port in self._get_ib_ports(dev):
|
||||
ll = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/link_layer")
|
||||
st = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state")
|
||||
if ll == "InfiniBand" and "ACTIVE" in st.upper():
|
||||
out.append(dev)
|
||||
break
|
||||
return out
|
||||
|
||||
def run_cross_node(self) -> dict:
|
||||
cn = self.rdma_cfg.get("cross_node", {}) or {}
|
||||
if not cn.get("enabled"):
|
||||
return {"status": "SKIP", "skipped": True,
|
||||
"reason": "rdma.cross_node.enabled is false"}
|
||||
|
||||
server = cn.get("server")
|
||||
if not server:
|
||||
return {"status": "SKIP", "skipped": True,
|
||||
"reason": "rdma.cross_node.server (peer ssh address) not set"}
|
||||
|
||||
ssh_user = cn.get("ssh_user", "root")
|
||||
server_target = server if "@" in server else f"{ssh_user}@{server}"
|
||||
# OOB address the client's perftest connects to (defaults to the ssh host).
|
||||
server_addr = cn.get("server_addr") or server.split("@")[-1]
|
||||
ib_port = cn.get("ib_port", 1)
|
||||
gid_index = cn.get("gid_index")
|
||||
msg_size = cn.get("msg_size", 1048576)
|
||||
iters = cn.get("iters", 5000)
|
||||
base_port = cn.get("base_oob_port", 18515)
|
||||
warmup = cn.get("server_warmup_sec", 2.0)
|
||||
min_bw = cn.get("min_bandwidth_gbps", 350)
|
||||
max_lat = cn.get("max_latency_us", 5)
|
||||
|
||||
devices = cn.get("devices") or self._active_ib_devices()
|
||||
if not devices:
|
||||
return {"status": "SKIP", "skipped": True,
|
||||
"reason": "no active InfiniBand devices to test"}
|
||||
|
||||
has_bw = self._find_tool("ib_write_bw") is not None
|
||||
has_lat = self._find_tool("ib_write_lat") is not None
|
||||
if not has_bw and not has_lat:
|
||||
return {"status": "SKIP", "skipped": True,
|
||||
"reason": "perftest (ib_write_bw / ib_write_lat) not installed"}
|
||||
|
||||
self.console.print(
|
||||
f"[cyan]Cross-node RDMA — client → {server_addr}, "
|
||||
f"devices: {', '.join(devices)}[/cyan]")
|
||||
|
||||
per_device = []
|
||||
for idx, dev in enumerate(devices):
|
||||
oob = base_port + idx
|
||||
entry = {"device": dev}
|
||||
|
||||
if has_bw:
|
||||
bw = self._cross_node_perftest(
|
||||
"ib_write_bw", dev, server_target, server_addr, ib_port,
|
||||
oob, gid_index, warmup,
|
||||
extra=["--report_gbits", "-s", str(msg_size), "-n", str(iters)],
|
||||
parse="bw")
|
||||
entry["bandwidth_gbps"] = bw
|
||||
if isinstance(bw, (int, float)):
|
||||
entry["bw_status"] = "PASS" if bw >= min_bw else "WARN"
|
||||
else:
|
||||
entry["bw_status"] = "FAIL"
|
||||
|
||||
if has_lat:
|
||||
lat = self._cross_node_perftest(
|
||||
"ib_write_lat", dev, server_target, server_addr, ib_port,
|
||||
oob, gid_index, warmup, extra=[], parse="lat")
|
||||
if isinstance(lat, dict):
|
||||
entry["latency_us"] = lat.get("typical")
|
||||
entry["latency_p99_us"] = lat.get("p99")
|
||||
t = lat.get("typical")
|
||||
entry["lat_status"] = ("PASS" if isinstance(t, (int, float)) and 0 < t <= max_lat
|
||||
else ("WARN" if isinstance(t, (int, float)) else "FAIL"))
|
||||
else:
|
||||
entry["latency_us"] = lat
|
||||
entry["lat_status"] = "FAIL"
|
||||
|
||||
per_device.append(entry)
|
||||
|
||||
statuses = [e.get(k) for e in per_device for k in ("bw_status", "lat_status") if e.get(k)]
|
||||
verdict = "PASS"
|
||||
for s in statuses:
|
||||
if s == "FAIL":
|
||||
verdict = "FAIL"
|
||||
break
|
||||
if s == "WARN" and verdict == "PASS":
|
||||
verdict = "WARN"
|
||||
|
||||
return {
|
||||
"status": verdict,
|
||||
"server": server_addr,
|
||||
"min_bandwidth_gbps": min_bw,
|
||||
"max_latency_us": max_lat,
|
||||
"per_device": per_device,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def _cross_node_perftest(self, tool: str, dev: str, server_target: str,
|
||||
server_addr: str, ib_port: int, oob_port: int,
|
||||
gid_index, warmup: float, extra: List[str], parse: str):
|
||||
"""Start `tool` server on the peer via SSH, run the local client, parse output.
|
||||
|
||||
Returns a float (bw, Gb/s), a dict {typical, p99} (lat, µs), or an error string.
|
||||
"""
|
||||
tool_path = self._find_tool(tool)
|
||||
if not tool_path:
|
||||
return f"{tool} not installed"
|
||||
|
||||
flags = ["-d", dev, "-i", str(ib_port), "-p", str(oob_port), "-F"]
|
||||
if gid_index is not None:
|
||||
flags += ["-x", str(gid_index)]
|
||||
flags += extra
|
||||
|
||||
server_cmd = " ".join([tool] + flags) # server: no host argument
|
||||
server_proc = None
|
||||
def _run_server_mode(self, label: str, server_cmd: List[str]) -> dict:
|
||||
timeout = int(self.rdma_cfg.get("server_timeout_sec", 120))
|
||||
try:
|
||||
server_proc = subprocess.Popen(
|
||||
["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
|
||||
server_target, server_cmd],
|
||||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
||||
time.sleep(warmup) # let the remote server bind before the client connects
|
||||
|
||||
client = subprocess.run([tool_path] + flags + [server_addr],
|
||||
capture_output=True, text=True, timeout=120)
|
||||
out = client.stdout + "\n" + (client.stderr or "")
|
||||
return self._parse_perftest_lat(out) if parse == "lat" else self._parse_perftest_bw(out)
|
||||
r = subprocess.run(server_cmd, capture_output=True, text=True, timeout=timeout)
|
||||
return {
|
||||
"test": label,
|
||||
"status": "PASS" if r.returncode == 0 else "FAIL",
|
||||
"role": "server",
|
||||
"server_timeout_sec": timeout,
|
||||
"output_tail": (r.stdout + r.stderr)[-500:],
|
||||
}
|
||||
except subprocess.TimeoutExpired:
|
||||
return "timeout"
|
||||
except Exception as e: # noqa: BLE001
|
||||
return f"error: {e}"
|
||||
finally:
|
||||
if server_proc and server_proc.poll() is None:
|
||||
server_proc.terminate()
|
||||
return {
|
||||
"test": label,
|
||||
"status": "PASS",
|
||||
"role": "server",
|
||||
"server_timeout_sec": timeout,
|
||||
"note": "server ran until timeout waiting for client",
|
||||
}
|
||||
except Exception as e:
|
||||
return {"test": label, "status": "FAIL", "role": "server", "error": str(e)}
|
||||
|
||||
def _run_ibping_tests(self, active_pairs: List[tuple[str, str]]) -> List[dict]:
|
||||
tool = self._find_tool("ibping")
|
||||
if not tool:
|
||||
return [{"test": "ibping", "status": "FAIL", "error": "not installed"}]
|
||||
if not active_pairs:
|
||||
return [{"test": "ibping", "status": "FAIL", "error": "no active IB ports"}]
|
||||
|
||||
dev, port = active_pairs[0]
|
||||
target = (
|
||||
self.rdma_cfg.get("ibping_target")
|
||||
or os.environ.get("IBPING_TARGET")
|
||||
)
|
||||
count = int(self.rdma_cfg.get("ibping_count", 5))
|
||||
role = self.rdma_cfg.get("role", "auto")
|
||||
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||
base = [tool, "-C", dev, "-P", str(port)]
|
||||
|
||||
if role == "server":
|
||||
return [self._run_server_mode("ibping", [*base, "-S"])]
|
||||
|
||||
server = None
|
||||
if not target and role != "client":
|
||||
target = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/lid")
|
||||
server = subprocess.Popen([*base, "-S"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
import time
|
||||
time.sleep(1)
|
||||
|
||||
if not target:
|
||||
reason = "no ibping target/lid"
|
||||
if role == "client" or server_addr:
|
||||
reason = (
|
||||
"cross-node ibping requires rdma.ibping_target or IBPING_TARGET "
|
||||
"(peer LID/GID; rdma.server_addr is only for perftest TCP bootstrap)"
|
||||
)
|
||||
return [{"test": "ibping", "status": "FAIL", "error": reason}]
|
||||
|
||||
try:
|
||||
r = subprocess.run([*base, "-c", str(count), str(target)], capture_output=True, text=True, timeout=30)
|
||||
if server:
|
||||
server.terminate()
|
||||
try:
|
||||
server_proc.wait(timeout=5)
|
||||
server.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
server.kill()
|
||||
output = r.stdout + r.stderr
|
||||
failed = r.returncode != 0 or "failed" in output.lower()
|
||||
return [{
|
||||
"test": "ibping",
|
||||
"status": "FAIL" if failed else "PASS",
|
||||
"role": "client" if server_addr or role == "client" else "local_loopback",
|
||||
"direction": "outbound_to_peer" if server_addr or role == "client" else "local_loopback",
|
||||
"target": str(target),
|
||||
"count": count,
|
||||
"output_tail": output[-500:],
|
||||
}]
|
||||
except Exception as e:
|
||||
if server:
|
||||
server.kill()
|
||||
return [{"test": "ibping", "status": "FAIL", "error": str(e)}]
|
||||
|
||||
def _collect_pfc_ecn_counters(self) -> dict:
|
||||
counters = {}
|
||||
failed = False
|
||||
keywords = ("pfc", "ecn", "cnp", "congestion")
|
||||
for root, _, files in os.walk("/sys/class/infiniband"):
|
||||
for name in files:
|
||||
lower = name.lower()
|
||||
if not any(k in lower for k in keywords):
|
||||
continue
|
||||
path = os.path.join(root, name)
|
||||
val = self._read_sys(path)
|
||||
try:
|
||||
num = int(val)
|
||||
except ValueError:
|
||||
continue
|
||||
rel = path.replace("/sys/class/infiniband/", "")
|
||||
counters[rel] = num
|
||||
if num != 0:
|
||||
failed = True
|
||||
|
||||
ethtool = shutil.which("ethtool")
|
||||
net_dir = "/sys/class/net"
|
||||
if ethtool and os.path.isdir(net_dir):
|
||||
for iface in sorted(os.listdir(net_dir)):
|
||||
try:
|
||||
r = subprocess.run(
|
||||
[ethtool, "-S", iface],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
except Exception:
|
||||
server_proc.kill()
|
||||
# ib_write_* server normally exits after one run; pkill cleans up a
|
||||
# leftover one if the client failed mid-handshake. -x matches the exact
|
||||
# process name so it never kills this ssh command itself.
|
||||
try:
|
||||
subprocess.run(
|
||||
["ssh", "-o", "BatchMode=yes", server_target, f"pkill -x {tool}"],
|
||||
capture_output=True, timeout=10)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _parse_perftest_bw(output: str) -> float:
|
||||
"""Parse ib_write_bw rows (#bytes #iter BW_peak BW_avg ...); return max BW avg."""
|
||||
best = 0.0
|
||||
for line in output.splitlines():
|
||||
parts = line.split()
|
||||
if len(parts) >= 4:
|
||||
try:
|
||||
int(parts[0]) # #bytes column
|
||||
best = max(best, float(parts[3])) # BW average[Gb/sec]
|
||||
except ValueError:
|
||||
continue
|
||||
return round(best, 2) if best else 0.0
|
||||
|
||||
@staticmethod
|
||||
def _parse_perftest_lat(output: str) -> dict:
|
||||
"""Parse ib_write_lat row (#bytes #iter t_min t_max t_typical t_avg ... 99%)."""
|
||||
for line in output.splitlines():
|
||||
parts = line.split()
|
||||
if len(parts) >= 6:
|
||||
try:
|
||||
int(parts[0]); int(parts[1])
|
||||
typical = float(parts[4]) # t_typical[usec]
|
||||
except ValueError:
|
||||
if r.returncode != 0:
|
||||
continue
|
||||
p99 = None
|
||||
if len(parts) >= 8:
|
||||
for line in r.stdout.splitlines():
|
||||
if ":" not in line:
|
||||
continue
|
||||
key, value = line.split(":", 1)
|
||||
key = key.strip()
|
||||
lower = key.lower()
|
||||
if not any(k in lower for k in keywords):
|
||||
continue
|
||||
try:
|
||||
p99 = float(parts[7]) # 99% percentile[usec]
|
||||
except ValueError:
|
||||
p99 = None
|
||||
return {"typical": round(typical, 2), "p99": round(p99, 2) if p99 else None}
|
||||
return {"typical": None, "p99": None}
|
||||
num = int(value.strip().split()[0])
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
counters[f"net/{iface}/{key}"] = num
|
||||
if num != 0:
|
||||
failed = True
|
||||
return {"failed": failed, "counters": counters}
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Console = None):
|
||||
@ -496,28 +596,10 @@ class RDMATest:
|
||||
f"({lat:.2f} us, max: {t.get('max_allowed_us', 'N/A')} us)" if status != "SKIP"
|
||||
else f" {t['test']}: [dim]SKIPPED[/dim]")
|
||||
|
||||
cn = results.get("cross_node")
|
||||
if cn:
|
||||
if cn.get("skipped"):
|
||||
c.print(f"\n [bold]Cross-node RDMA[/bold]: [dim]SKIPPED "
|
||||
f"({cn.get('reason', '')})[/dim]")
|
||||
else:
|
||||
v = cn.get("status", "?")
|
||||
vc = "green" if v == "PASS" else ("yellow" if v == "WARN" else "red")
|
||||
c.print(f"\n [bold]Cross-node RDMA[/bold] (server {cn.get('server')}) "
|
||||
f"[{vc}]{v}[/{vc}] "
|
||||
f"[dim]min {cn.get('min_bandwidth_gbps')} Gb/s, "
|
||||
f"max {cn.get('max_latency_us')} µs[/dim]")
|
||||
for e in cn.get("per_device", []):
|
||||
bw = e.get("bandwidth_gbps")
|
||||
lat = e.get("latency_us")
|
||||
bws = e.get("bw_status", "")
|
||||
lts = e.get("lat_status", "")
|
||||
bc = "green" if bws == "PASS" else ("yellow" if bws == "WARN" else "red")
|
||||
lc = "green" if lts == "PASS" else ("yellow" if lts == "WARN" else "red")
|
||||
bw_s = f"{bw:.1f} Gb/s" if isinstance(bw, (int, float)) else str(bw)
|
||||
lat_s = f"{lat:.2f} µs" if isinstance(lat, (int, float)) else str(lat)
|
||||
p99 = e.get("latency_p99_us")
|
||||
p99_s = f", p99 {p99:.2f}" if isinstance(p99, (int, float)) else ""
|
||||
c.print(f" {e['device']}: BW [{bc}]{bw_s}[/{bc}] | "
|
||||
f"lat [{lc}]{lat_s}[/{lc}]{p99_s}")
|
||||
ibping_tests = results.get("ibping_tests", [])
|
||||
if ibping_tests:
|
||||
c.print("\n [bold]IB Ping Tests[/bold]")
|
||||
for t in ibping_tests:
|
||||
status = t.get("status", "FAIL")
|
||||
sc = "green" if status == "PASS" else "red"
|
||||
c.print(f" {t['test']}: [{sc}]{status}[/{sc}] target={t.get('target', 'N/A')}")
|
||||
|
||||
@ -1,87 +0,0 @@
|
||||
# cuBLASLt FP8 GEMM Cross-Check Report
|
||||
|
||||
Date: 2026-05-24
|
||||
|
||||
Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
|
||||
|
||||
## Method
|
||||
|
||||
Added a direct cuBLASLt FP8 GEMM micro-benchmark:
|
||||
|
||||
- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
|
||||
- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
|
||||
- Input dtype: `CUDA_R_8F_E4M3`
|
||||
- Output dtype: `CUDA_R_16BF`
|
||||
- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
|
||||
- Layout: cuBLASLt FP8-required TN format
|
||||
- Matrix size: `8192`
|
||||
- Warmup: `50`
|
||||
- Iterations: `500`
|
||||
- GPUs: single-node 8 GPUs, measured one GPU at a time
|
||||
|
||||
NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
|
||||
|
||||
## Results
|
||||
|
||||
### aikubeworker0012 / nccl-gpu-1
|
||||
|
||||
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
|
||||
|
||||
| GPU | FP8 TFLOPS |
|
||||
|---:|---:|
|
||||
| 0 | 1615.6 |
|
||||
| 1 | 1611.0 |
|
||||
| 2 | 1599.0 |
|
||||
| 3 | 1607.1 |
|
||||
| 4 | 1614.0 |
|
||||
| 5 | 1604.4 |
|
||||
| 6 | 1608.4 |
|
||||
| 7 | 1609.1 |
|
||||
|
||||
Summary:
|
||||
|
||||
- Mean: `1608.6 TFLOPS`
|
||||
- Min / Max: `1599.0 / 1615.6 TFLOPS`
|
||||
- Spread: `1.03%`
|
||||
- FP8 absolute threshold: `>= 1400 TFLOPS`
|
||||
- Verdict against FP8 absolute threshold: **PASS**
|
||||
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
|
||||
|
||||
### aikubeworker0016 / nccl-gpu-2
|
||||
|
||||
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
|
||||
|
||||
| GPU | FP8 TFLOPS |
|
||||
|---:|---:|
|
||||
| 0 | 1602.3 |
|
||||
| 1 | 1604.0 |
|
||||
| 2 | 1616.9 |
|
||||
| 3 | 1610.6 |
|
||||
| 4 | 1620.5 |
|
||||
| 5 | 1630.3 |
|
||||
| 6 | 1605.1 |
|
||||
| 7 | 1620.2 |
|
||||
|
||||
Summary:
|
||||
|
||||
- Mean: `1613.7 TFLOPS`
|
||||
- Min / Max: `1602.3 / 1630.3 TFLOPS`
|
||||
- Spread: `1.74%`
|
||||
- FP8 absolute threshold: `>= 1400 TFLOPS`
|
||||
- Verdict against FP8 absolute threshold: **PASS**
|
||||
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
|
||||
|
||||
## Comparison With Existing PyTorch `_scaled_mm` Result
|
||||
|
||||
| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
|
||||
|---|---:|---:|---:|
|
||||
| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
|
||||
| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
|
||||
|
||||
The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
|
||||
|
||||
## Conclusion
|
||||
|
||||
The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
|
||||
|
||||
Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.
|
||||
@ -1,21 +0,0 @@
|
||||
{
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"per_gpu": [
|
||||
{"index": 0, "fp8_tflops": 1615.6},
|
||||
{"index": 1, "fp8_tflops": 1611.0},
|
||||
{"index": 2, "fp8_tflops": 1599.0},
|
||||
{"index": 3, "fp8_tflops": 1607.1},
|
||||
{"index": 4, "fp8_tflops": 1614.0},
|
||||
{"index": 5, "fp8_tflops": 1604.4},
|
||||
{"index": 6, "fp8_tflops": 1608.4},
|
||||
{"index": 7, "fp8_tflops": 1609.1}
|
||||
],
|
||||
"mean_tflops": 1608.6,
|
||||
"min_tflops": 1599.0,
|
||||
"max_tflops": 1615.6,
|
||||
"spread_pct": 1.03
|
||||
}
|
||||
@ -1,21 +0,0 @@
|
||||
{
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"per_gpu": [
|
||||
{"index": 0, "fp8_tflops": 1602.3},
|
||||
{"index": 1, "fp8_tflops": 1604.0},
|
||||
{"index": 2, "fp8_tflops": 1616.9},
|
||||
{"index": 3, "fp8_tflops": 1610.6},
|
||||
{"index": 4, "fp8_tflops": 1620.5},
|
||||
{"index": 5, "fp8_tflops": 1630.3},
|
||||
{"index": 6, "fp8_tflops": 1605.1},
|
||||
{"index": 7, "fp8_tflops": 1620.2}
|
||||
],
|
||||
"mean_tflops": 1613.7,
|
||||
"min_tflops": 1602.3,
|
||||
"max_tflops": 1630.3,
|
||||
"spread_pct": 1.74
|
||||
}
|
||||
@ -1,169 +0,0 @@
|
||||
# FP8 GEMM 路径对比测试报告
|
||||
|
||||
测试日期:2026-05-25
|
||||
测试节点:aikubeworker0012、aikubeworker0016
|
||||
测试 GPU:NVIDIA H100 80GB HBM3
|
||||
测试目标:对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
|
||||
|
||||
## 一、测试结论
|
||||
|
||||
本次 A-E 五条路径均已完成实测。
|
||||
|
||||
核心结论:
|
||||
|
||||
1. direct cuBLASLt 是本组测试里最快路径,两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
|
||||
2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
|
||||
3. 打开 `use_fast_accum=True` 后,PyTorch eager 路径有稳定提升,约提升 5.0%-6.7%。
|
||||
4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS,但仍低于 direct cuBLASLt。
|
||||
5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径,不是裸 GEMM,因此包含 TE module、cast、FP8 recipe 等额外开销,结果低于 direct cuBLASLt,也低于 CUDA Graph `_scaled_mm`。
|
||||
|
||||
这说明:当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题;之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果,主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能,而不是 GPU 算力极限。
|
||||
|
||||
## 二、测试方法
|
||||
|
||||
统一参数:
|
||||
|
||||
| 参数 | 值 |
|
||||
|---|---:|
|
||||
| matrix_size | 8192 |
|
||||
| M/N/K | 8192/8192/8192 |
|
||||
| warmup | 50 |
|
||||
| iterations | 500 |
|
||||
| GPU index | 0 |
|
||||
| PyTorch | 2.6.0+cu124 |
|
||||
| CUDA | 12.4 |
|
||||
| 输入 dtype | FP8 E4M3 |
|
||||
| 输出 dtype | BF16 |
|
||||
| accumulation | FP32 |
|
||||
| scale_a / scale_b | 1.0 / 1.0 |
|
||||
|
||||
测试路径定义:
|
||||
|
||||
| 路径 | 名称 | 含义 |
|
||||
|---|---|---|
|
||||
| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`,默认 accumulation 参数 |
|
||||
| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径,但显式打开 fast accumulation |
|
||||
| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用,降低 Python/PyTorch launch 间隙 |
|
||||
| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行,包含 TE 层封装和 FP8 recipe 开销 |
|
||||
| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`,绕过 PyTorch eager |
|
||||
|
||||
复现脚本:
|
||||
|
||||
```bash
|
||||
MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
|
||||
/root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
|
||||
```
|
||||
|
||||
## 三、实测结果
|
||||
|
||||
### aikubeworker0012
|
||||
|
||||
原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
|
||||
|
||||
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|
||||
|---|---|---:|---:|
|
||||
| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
|
||||
| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
|
||||
| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
|
||||
| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
|
||||
| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
|
||||
|
||||
相对 A 的提升:
|
||||
|
||||
| 路径 | 相对 A |
|
||||
|---|---:|
|
||||
| B | +6.7% |
|
||||
| C | +11.5% |
|
||||
| D | -2.8% |
|
||||
| E | +37.1% |
|
||||
|
||||
E 路径 cuBLASLt 算法信息:
|
||||
|
||||
| 字段 | 值 |
|
||||
|---|---:|
|
||||
| algo_id | 52 |
|
||||
| tile_id | 23 |
|
||||
| splitk | 1 |
|
||||
| stages_id | 36 |
|
||||
| inner_shape_id | 0 |
|
||||
| cluster_shape_id | 3 |
|
||||
|
||||
### aikubeworker0016
|
||||
|
||||
原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
|
||||
|
||||
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|
||||
|---|---|---:|---:|
|
||||
| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
|
||||
| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
|
||||
| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
|
||||
| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
|
||||
| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
|
||||
|
||||
相对 A 的提升:
|
||||
|
||||
| 路径 | 相对 A |
|
||||
|---|---:|
|
||||
| B | +5.0% |
|
||||
| C | +10.0% |
|
||||
| D | -3.2% |
|
||||
| E | +37.5% |
|
||||
|
||||
E 路径 cuBLASLt 算法信息:
|
||||
|
||||
| 字段 | 值 |
|
||||
|---|---:|
|
||||
| algo_id | 52 |
|
||||
| tile_id | 23 |
|
||||
| splitk | 1 |
|
||||
| stages_id | 36 |
|
||||
| inner_shape_id | 0 |
|
||||
| cluster_shape_id | 3 |
|
||||
|
||||
## 四、对 PyTorch FP8 能否“上去”的判断
|
||||
|
||||
从本次结果看,PyTorch FP8 路径可以通过两类方式上去:
|
||||
|
||||
1. 打开更快的 math/accumulation 参数,例如 `use_fast_accum=True`。
|
||||
2. 使用 CUDA Graph replay,减少 eager 模式下每轮调度、enqueue 之间的间隙。
|
||||
|
||||
但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下,PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说,direct cuBLASLt 证明硬件和底层库有能力跑得更高;PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
|
||||
|
||||
如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”,后续可以继续验证:
|
||||
|
||||
1. 更大的 GEMM size,例如 16384。
|
||||
2. 固定 shape 后用 `torch.compile` 或 Inductor。
|
||||
3. CUDA Graph 覆盖更完整的 step,而不是只 replay 单个 op。
|
||||
4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark,而不是 `te.Linear` module forward。
|
||||
5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取,确认实际 kernel、间隙和 cuBLASLt 算法选择。
|
||||
|
||||
## 五、术语说明
|
||||
|
||||
`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`,PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic,然后把 matmul enqueue 到 CUDA stream。
|
||||
|
||||
`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口,支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
|
||||
|
||||
`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`,不经过 PyTorch eager,因此更接近裸 GEMM 峰值。
|
||||
|
||||
`CUDA Graph` 指把一次 CUDA work 提前捕获成图,后续直接 replay,减少 CPU 侧反复 launch/调度带来的间隙。
|
||||
|
||||
`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward,不等同于裸 GEMM microbenchmark。
|
||||
|
||||
## 六、文件清单
|
||||
|
||||
本地脚本:
|
||||
|
||||
| 文件 | 用途 |
|
||||
|---|---|
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
|
||||
|
||||
本地结果:
|
||||
|
||||
| 文件 | 用途 |
|
||||
|---|---|
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
|
||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
|
||||
|
||||
@ -1,142 +0,0 @@
|
||||
{
|
||||
"source": "fp8_path_comparison",
|
||||
"host": null,
|
||||
"matrix_size": 8192,
|
||||
"gpu_index": 0,
|
||||
"pytorch": {
|
||||
"source": "pytorch_fp8_path_bench",
|
||||
"torch": "2.6.0+cu124",
|
||||
"cuda": "12.4",
|
||||
"gpu_index": 0,
|
||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 465.145,
|
||||
"event_us_per_iter": 930.29,
|
||||
"wall_ms_total": 465.21,
|
||||
"tflops": 1181.9
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 440.252,
|
||||
"event_us_per_iter": 880.504,
|
||||
"wall_ms_total": 440.289,
|
||||
"tflops": 1248.7
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 415.631,
|
||||
"event_us_per_iter": 831.262,
|
||||
"wall_ms_total": 415.664,
|
||||
"tflops": 1322.7
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "unavailable",
|
||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"max_tflops": 1322.7,
|
||||
"min_tflops": 1181.9,
|
||||
"mean_tflops": 1251.1
|
||||
}
|
||||
},
|
||||
"cublaslt": {
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"fast_accum": 1,
|
||||
"per_gpu": [
|
||||
{
|
||||
"index": 0,
|
||||
"fp8_tflops": 1615.4,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3
|
||||
}
|
||||
],
|
||||
"mean_tflops": 1615.4,
|
||||
"min_tflops": 1615.4,
|
||||
"max_tflops": 1615.4,
|
||||
"spread_pct": 0.0
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 465.145,
|
||||
"event_us_per_iter": 930.29,
|
||||
"wall_ms_total": 465.21,
|
||||
"tflops": 1181.9
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 440.252,
|
||||
"event_us_per_iter": 880.504,
|
||||
"wall_ms_total": 440.289,
|
||||
"tflops": 1248.7
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 415.631,
|
||||
"event_us_per_iter": 831.262,
|
||||
"wall_ms_total": 415.664,
|
||||
"tflops": 1322.7
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "unavailable",
|
||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3,
|
||||
"name": "E_direct_cublaslt_fast_accum",
|
||||
"status": "ok",
|
||||
"tflops": 1615.4,
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"fast_accum": 1,
|
||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,156 +0,0 @@
|
||||
{
|
||||
"source": "fp8_path_comparison",
|
||||
"host": null,
|
||||
"matrix_size": 8192,
|
||||
"gpu_index": 0,
|
||||
"pytorch": {
|
||||
"source": "pytorch_fp8_path_bench",
|
||||
"torch": "2.6.0+cu124",
|
||||
"cuda": "12.4",
|
||||
"gpu_index": 0,
|
||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 463.507,
|
||||
"event_us_per_iter": 927.014,
|
||||
"wall_ms_total": 463.573,
|
||||
"tflops": 1186.1
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 434.241,
|
||||
"event_us_per_iter": 868.481,
|
||||
"wall_ms_total": 434.492,
|
||||
"tflops": 1266.0
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 415.786,
|
||||
"event_us_per_iter": 831.573,
|
||||
"wall_ms_total": 415.825,
|
||||
"tflops": 1322.2
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 476.739,
|
||||
"event_us_per_iter": 953.478,
|
||||
"wall_ms_total": 476.8,
|
||||
"tflops": 1153.2,
|
||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"max_tflops": 1322.2,
|
||||
"min_tflops": 1153.2,
|
||||
"mean_tflops": 1231.9
|
||||
}
|
||||
},
|
||||
"cublaslt": {
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"fast_accum": 1,
|
||||
"per_gpu": [
|
||||
{
|
||||
"index": 0,
|
||||
"fp8_tflops": 1626.6,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3
|
||||
}
|
||||
],
|
||||
"mean_tflops": 1626.6,
|
||||
"min_tflops": 1626.6,
|
||||
"max_tflops": 1626.6,
|
||||
"spread_pct": 0.0
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 463.507,
|
||||
"event_us_per_iter": 927.014,
|
||||
"wall_ms_total": 463.573,
|
||||
"tflops": 1186.1
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 434.241,
|
||||
"event_us_per_iter": 868.481,
|
||||
"wall_ms_total": 434.492,
|
||||
"tflops": 1266.0
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 415.786,
|
||||
"event_us_per_iter": 831.573,
|
||||
"wall_ms_total": 415.825,
|
||||
"tflops": 1322.2
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 476.739,
|
||||
"event_us_per_iter": 953.478,
|
||||
"wall_ms_total": 476.8,
|
||||
"tflops": 1153.2,
|
||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3,
|
||||
"name": "E_direct_cublaslt_fast_accum",
|
||||
"status": "ok",
|
||||
"tflops": 1626.6,
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"fast_accum": 1,
|
||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,142 +0,0 @@
|
||||
{
|
||||
"source": "fp8_path_comparison",
|
||||
"host": null,
|
||||
"matrix_size": 8192,
|
||||
"gpu_index": 0,
|
||||
"pytorch": {
|
||||
"source": "pytorch_fp8_path_bench",
|
||||
"torch": "2.6.0+cu124",
|
||||
"cuda": "12.4",
|
||||
"gpu_index": 0,
|
||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 470.909,
|
||||
"event_us_per_iter": 941.817,
|
||||
"wall_ms_total": 470.974,
|
||||
"tflops": 1167.4
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 452.608,
|
||||
"event_us_per_iter": 905.215,
|
||||
"wall_ms_total": 452.647,
|
||||
"tflops": 1214.6
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 427.724,
|
||||
"event_us_per_iter": 855.449,
|
||||
"wall_ms_total": 427.768,
|
||||
"tflops": 1285.3
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "unavailable",
|
||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"max_tflops": 1285.3,
|
||||
"min_tflops": 1167.4,
|
||||
"mean_tflops": 1222.4
|
||||
}
|
||||
},
|
||||
"cublaslt": {
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"fast_accum": 1,
|
||||
"per_gpu": [
|
||||
{
|
||||
"index": 0,
|
||||
"fp8_tflops": 1594.3,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3
|
||||
}
|
||||
],
|
||||
"mean_tflops": 1594.3,
|
||||
"min_tflops": 1594.3,
|
||||
"max_tflops": 1594.3,
|
||||
"spread_pct": 0.0
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 470.909,
|
||||
"event_us_per_iter": 941.817,
|
||||
"wall_ms_total": 470.974,
|
||||
"tflops": 1167.4
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 452.608,
|
||||
"event_us_per_iter": 905.215,
|
||||
"wall_ms_total": 452.647,
|
||||
"tflops": 1214.6
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 427.724,
|
||||
"event_us_per_iter": 855.449,
|
||||
"wall_ms_total": 427.768,
|
||||
"tflops": 1285.3
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "unavailable",
|
||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3,
|
||||
"name": "E_direct_cublaslt_fast_accum",
|
||||
"status": "ok",
|
||||
"tflops": 1594.3,
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"fast_accum": 1,
|
||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,156 +0,0 @@
|
||||
{
|
||||
"source": "fp8_path_comparison",
|
||||
"host": null,
|
||||
"matrix_size": 8192,
|
||||
"gpu_index": 0,
|
||||
"pytorch": {
|
||||
"source": "pytorch_fp8_path_bench",
|
||||
"torch": "2.6.0+cu124",
|
||||
"cuda": "12.4",
|
||||
"gpu_index": 0,
|
||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 473.156,
|
||||
"event_us_per_iter": 946.313,
|
||||
"wall_ms_total": 473.199,
|
||||
"tflops": 1161.9
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 450.48,
|
||||
"event_us_per_iter": 900.96,
|
||||
"wall_ms_total": 450.505,
|
||||
"tflops": 1220.4
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 430.272,
|
||||
"event_us_per_iter": 860.543,
|
||||
"wall_ms_total": 430.304,
|
||||
"tflops": 1277.7
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 488.527,
|
||||
"event_us_per_iter": 977.054,
|
||||
"wall_ms_total": 488.576,
|
||||
"tflops": 1125.3,
|
||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"max_tflops": 1277.7,
|
||||
"min_tflops": 1125.3,
|
||||
"mean_tflops": 1196.3
|
||||
}
|
||||
},
|
||||
"cublaslt": {
|
||||
"source": "cuBLASLt",
|
||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"fast_accum": 1,
|
||||
"per_gpu": [
|
||||
{
|
||||
"index": 0,
|
||||
"fp8_tflops": 1598.1,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3
|
||||
}
|
||||
],
|
||||
"mean_tflops": 1598.1,
|
||||
"min_tflops": 1598.1,
|
||||
"max_tflops": 1598.1,
|
||||
"spread_pct": 0.0
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"name": "A_eager_scaled_mm_default",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 473.156,
|
||||
"event_us_per_iter": 946.313,
|
||||
"wall_ms_total": 473.199,
|
||||
"tflops": 1161.9
|
||||
},
|
||||
{
|
||||
"name": "B_eager_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 450.48,
|
||||
"event_us_per_iter": 900.96,
|
||||
"wall_ms_total": 450.505,
|
||||
"tflops": 1220.4
|
||||
},
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 3,
|
||||
"event_ms_total": 430.272,
|
||||
"event_us_per_iter": 860.543,
|
||||
"wall_ms_total": 430.304,
|
||||
"tflops": 1277.7
|
||||
},
|
||||
{
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "ok",
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"event_ms_total": 488.527,
|
||||
"event_us_per_iter": 977.054,
|
||||
"wall_ms_total": 488.576,
|
||||
"tflops": 1125.3,
|
||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"algo_id": 52,
|
||||
"tile_id": 23,
|
||||
"splitk": 1,
|
||||
"stages_id": 36,
|
||||
"inner_shape_id": 0,
|
||||
"cluster_shape_id": 3,
|
||||
"name": "E_direct_cublaslt_fast_accum",
|
||||
"status": "ok",
|
||||
"tflops": 1598.1,
|
||||
"matrix_size": 8192,
|
||||
"iterations": 500,
|
||||
"warmup": 50,
|
||||
"fast_accum": 1,
|
||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,152 +0,0 @@
|
||||
# GPU_Test 合并报告
|
||||
|
||||
- **日期:** 2026-05-24
|
||||
- **节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16`
|
||||
- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
|
||||
- **范围:** 单机单卡算力与多机多卡 NCCL 通信
|
||||
- **说明:** 本报告汇总既有原始测试结果,不重新启动额外压力测试。
|
||||
|
||||
## 总体结论
|
||||
|
||||
| 测试项 | 结论 | 说明 |
|
||||
|---|---|---|
|
||||
| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
|
||||
| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
|
||||
| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`,低于阈值;已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低,不作为硬件失败依据 |
|
||||
| 多机多卡 NCCL 正确性 | PASS | return code `0`,`Wrong=0` / `Out of bounds values: 0 OK` |
|
||||
| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值,但该阈值不应直接硬套到当前 4x400Gbps 环境 |
|
||||
|
||||
## 单机单卡 / 算力测试
|
||||
|
||||
### 机器信息
|
||||
|
||||
| Host | GPU | Driver | CUDA | GPU 数量 |
|
||||
|---|---|---|---|---:|
|
||||
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
||||
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
||||
|
||||
来源:
|
||||
|
||||
- `reports_single_gpu_aikubeworker0012.md`
|
||||
- `reports_single_gpu_aikubeworker0016.md`
|
||||
|
||||
### 原始 PyTorch 单机算力结果
|
||||
|
||||
| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
|
||||
|---|---:|---:|---:|---:|---:|---|
|
||||
| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
|
||||
| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
|
||||
|
||||
原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示,该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响,不能直接代表 H100 FP8 Tensor Core 硬件上限。
|
||||
|
||||
### direct cuBLASLt FP8 GEMM 交叉验证
|
||||
|
||||
测试参数:
|
||||
|
||||
| 参数 | 值 |
|
||||
|---|---|
|
||||
| Benchmark | direct cuBLASLt FP8 GEMM |
|
||||
| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
|
||||
| Matrix | `8192 x 8192 x 8192` |
|
||||
| A/B dtype | FP8 E4M3 |
|
||||
| Output dtype | BF16 |
|
||||
| Compute type | `CUBLAS_COMPUTE_32F` |
|
||||
| Scale type | `CUDA_R_32F` |
|
||||
| Scale A/B | `1.0` |
|
||||
| Layout | TN |
|
||||
| fast accumulation | enabled |
|
||||
| Threshold | `>= 1400 TFLOPS` |
|
||||
|
||||
结果:
|
||||
|
||||
| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
|
||||
|---|---:|---:|---:|---:|---:|---|
|
||||
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
|
||||
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
|
||||
|
||||
单卡逐张结果:
|
||||
|
||||
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|
||||
|---|---:|---:|---:|---:|---:|---:|---:|---:|
|
||||
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
|
||||
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
|
||||
|
||||
结论:direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值,说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录,而不是 GPU 硬件失败结论。
|
||||
|
||||
来源:
|
||||
|
||||
- `reports_cublaslt_fp8_crosscheck_20260524.md`
|
||||
- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
|
||||
- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
|
||||
|
||||
## 多机多卡 NCCL 测试
|
||||
|
||||
### 测试环境
|
||||
|
||||
| 项目 | 结果 |
|
||||
|---|---|
|
||||
| Hosts | `nccl-gpu-1(172.72.8.12)`,`nccl-gpu-2(172.72.8.16)` |
|
||||
| Topology | 2 nodes x 8 GPUs,合计 16 GPUs |
|
||||
| NCCL source | `nccl-tests-mpirun` |
|
||||
| NCCL network | IB |
|
||||
| GPU Direct RDMA | ENABLED |
|
||||
| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
|
||||
| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
|
||||
|
||||
注意:NCCL 表里的 `GB/s` 是大 B,即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b,即 bits/s。
|
||||
|
||||
### 2x8 全集合通信结果
|
||||
|
||||
| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
|
||||
|---|---:|---:|---:|---|---|
|
||||
| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
|
||||
| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
|
||||
| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
||||
| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
||||
| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
||||
| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
||||
|
||||
结论:2x8 全集合通信测试中,NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值,但当前机器确认参与 NCCL 的是 4 条 400Gbps rail,因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
|
||||
|
||||
来源:
|
||||
|
||||
- `reports_multinode_nccl_all_collectives_20260523_120144.md`
|
||||
- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
|
||||
|
||||
### PDF Matrix allreduce / alltoall 结果
|
||||
|
||||
AllReduce(PDF 8x400Gbps 阈值对比,仅作参考):
|
||||
|
||||
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|
||||
|---|---:|---:|---:|---:|---|
|
||||
| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
|
||||
| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
|
||||
| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
|
||||
| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
|
||||
|
||||
AllToAll(PDF 8x400Gbps 阈值对比,仅作参考):
|
||||
|
||||
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|
||||
|---|---:|---:|---:|---:|---|
|
||||
| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
|
||||
| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
|
||||
| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
|
||||
| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
|
||||
|
||||
来源:
|
||||
|
||||
- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
|
||||
- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
|
||||
|
||||
## 风险与判断
|
||||
|
||||
1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证,当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
|
||||
2. 多机 NCCL 正确性通过,性能结果应按当前 4x400Gbps rail 环境解释。
|
||||
3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail;PDF 参考环境为 8x400G 计算管理网络,因此 2x8 阈值与当前硬件形态不等价。
|
||||
4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值,建议作为“与 PDF 参考环境差异”记录,而不是作为当前 4 rail 环境不合格结论。
|
||||
|
||||
## 建议
|
||||
|
||||
1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主,PyTorch `_scaled_mm` 作为软件栈参考项保留。
|
||||
2. 多机 NCCL 后续若要按 PDF 阈值验收,需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
|
||||
3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`:NCCL bus bandwidth 是大 B,IB 端口速率是小 b。
|
||||
@ -1,123 +0,0 @@
|
||||
# GPU_Test 双节点测试报告
|
||||
|
||||
- **测试日期:** 2026-05-24
|
||||
- **测试节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16`
|
||||
- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
|
||||
- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
|
||||
- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
|
||||
|
||||
## 结论摘要
|
||||
|
||||
| 项目 | 结果摘要 |
|
||||
|---|---|
|
||||
| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
|
||||
| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS |
|
||||
| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成,主要大包通信带宽稳定 |
|
||||
| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过,所有测试 `Wrong=0` / return code `0` |
|
||||
| 多机网络口径 | 当前为 4x400Gbps IB rail 环境,结果按该硬件形态解释 |
|
||||
|
||||
## 测试环境
|
||||
|
||||
| Host | GPU | Driver | CUDA | GPU 数量 |
|
||||
|---|---|---|---|---:|
|
||||
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
||||
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
||||
|
||||
## 单机算力测试
|
||||
|
||||
### FP8 GEMM 硬件路径验证
|
||||
|
||||
本项使用 direct cuBLASLt FP8 GEMM benchmark,绕过 PyTorch eager 调度路径,直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
|
||||
|
||||
| 参数 | 配置 |
|
||||
|---|---|
|
||||
| GEMM shape | `8192 x 8192 x 8192` |
|
||||
| 输入类型 | FP8 E4M3 |
|
||||
| 输出类型 | BF16 |
|
||||
| 累加类型 | FP32 compute |
|
||||
| Layout | TN |
|
||||
| Scale | `scale_a = 1.0`,`scale_b = 1.0` |
|
||||
| fast accumulation | enabled |
|
||||
| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
|
||||
|
||||
| Host | Mean FP8 TFLOPS | Min | Max | Spread |
|
||||
|---|---:|---:|---:|---:|
|
||||
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
|
||||
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
|
||||
|
||||
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|
||||
|---|---:|---:|---:|---:|---:|---:|---:|---:|
|
||||
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
|
||||
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
|
||||
|
||||
**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS,该结果反映 PyTorch 软件路径与调度开销,不作为本报告的硬件算力结论。
|
||||
|
||||
## 单机 8 卡 NCCL 通信测试
|
||||
|
||||
本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试,结果单位为 `GB/s`,即 Bytes/s。
|
||||
|
||||
| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
|
||||
|---|---:|---:|
|
||||
| allreduce | 472.3 GB/s | 472.4 GB/s |
|
||||
| alltoall | 343.3 GB/s | 344.3 GB/s |
|
||||
| broadcast | 364.1 GB/s | 363.6 GB/s |
|
||||
| reducescatter | 352.8 GB/s | 353.1 GB/s |
|
||||
| allgather | 366.4 GB/s | 366.4 GB/s |
|
||||
| sendrecv | 369.0 GB/s | 368.9 GB/s |
|
||||
|
||||
**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近,节点间差异较小。
|
||||
|
||||
## 多机 2x8 NCCL 通信测试
|
||||
|
||||
本项使用两台节点,每台 8 张 GPU,共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
|
||||
|
||||
### 网络环境
|
||||
|
||||
| 项目 | 配置 |
|
||||
|---|---|
|
||||
| Host A | `aikubeworker0012 / 172.72.8.12` |
|
||||
| Host B | `aikubeworker0016 / 172.72.8.16` |
|
||||
| 拓扑 | 2 nodes x 8 GPUs |
|
||||
| NCCL network | IB |
|
||||
| GPU Direct RDMA | ENABLED |
|
||||
| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
|
||||
| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
|
||||
|
||||
### 跨节点 NCCL 结果
|
||||
|
||||
| Operation | Peak Bus BW | Avg Bus BW | Correctness |
|
||||
|---|---:|---:|---|
|
||||
| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
|
||||
| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
|
||||
| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
|
||||
| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
|
||||
| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
|
||||
| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
|
||||
|
||||
**正确性:** 本轮多机 NCCL 测试 return code 为 `0`,`Wrong=0`,未发现数据正确性错误。
|
||||
|
||||
## 单位说明
|
||||
|
||||
| 写法 | 含义 | 说明 |
|
||||
|---|---|---|
|
||||
| `GB/s` | Gigabytes per second | 大 B,字节每秒,NCCL bus bandwidth 使用此单位 |
|
||||
| `Gbps` / `Gb/s` | Gigabits per second | 小 b,比特每秒,IB 端口速率通常使用此单位 |
|
||||
|
||||
换算关系:
|
||||
|
||||
```text
|
||||
1 Byte = 8 bits
|
||||
400 Gb/s = 50 GB/s
|
||||
4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
|
||||
```
|
||||
|
||||
NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽,不等同于单条物理链路的线速。
|
||||
|
||||
## 结果说明
|
||||
|
||||
1. 两台节点 GPU 识别正常,均为 8 张 H100 80GB HBM3。
|
||||
2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPS,GPU FP8 硬件计算路径正常。
|
||||
3. 单机 8 卡 NCCL 通信在两台节点上结果接近,未观察到明显节点间异常差异。
|
||||
4. 多机 2x8 NCCL 正确性通过,跨节点通信功能正常。
|
||||
5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释;若后续需要对齐 8x400Gbps 环境,应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
|
||||
|
||||
@ -1,102 +0,0 @@
|
||||
@page {
|
||||
size: A4 landscape;
|
||||
margin: 13mm;
|
||||
}
|
||||
|
||||
body {
|
||||
color: #111827;
|
||||
font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
|
||||
font-size: 11px;
|
||||
line-height: 1.45;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #0f172a;
|
||||
font-size: 24px;
|
||||
margin: 0 0 14px;
|
||||
}
|
||||
|
||||
h2 {
|
||||
border-bottom: 1px solid #cbd5e1;
|
||||
color: #0f172a;
|
||||
font-size: 17px;
|
||||
margin: 24px 0 10px;
|
||||
padding-bottom: 4px;
|
||||
}
|
||||
|
||||
h3 {
|
||||
color: #1f2937;
|
||||
font-size: 13px;
|
||||
margin: 16px 0 8px;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 7px 0;
|
||||
}
|
||||
|
||||
code {
|
||||
background: #f1f5f9;
|
||||
border-radius: 3px;
|
||||
color: #0f172a;
|
||||
font-family: Menlo, Consolas, monospace;
|
||||
font-size: 10px;
|
||||
padding: 1px 3px;
|
||||
}
|
||||
|
||||
pre {
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 4px;
|
||||
padding: 8px;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
margin: 8px 0 14px;
|
||||
page-break-inside: auto;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
thead {
|
||||
display: table-header-group;
|
||||
}
|
||||
|
||||
tr {
|
||||
page-break-inside: avoid;
|
||||
}
|
||||
|
||||
th,
|
||||
td {
|
||||
border: 1px solid #cbd5e1;
|
||||
padding: 5px 6px;
|
||||
text-align: left;
|
||||
vertical-align: middle;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
th {
|
||||
background: #e2e8f0;
|
||||
color: #0f172a;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
tbody tr:nth-child(even) td {
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #2563eb;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
ul,
|
||||
ol {
|
||||
margin: 6px 0 10px 20px;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
li {
|
||||
margin: 3px 0;
|
||||
}
|
||||
|
||||
@ -1,291 +0,0 @@
|
||||
#include <cublasLt.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define CHECK_CUDA(call) \
|
||||
do { \
|
||||
cudaError_t status = (call); \
|
||||
if (status != cudaSuccess) { \
|
||||
std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
|
||||
cudaGetErrorString(status)); \
|
||||
std::exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CHECK_CUBLAS(call) \
|
||||
do { \
|
||||
cublasStatus_t status = (call); \
|
||||
if (status != CUBLAS_STATUS_SUCCESS) { \
|
||||
std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \
|
||||
__LINE__, static_cast<int>(status)); \
|
||||
std::exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
|
||||
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = tid; i < count; i += stride) {
|
||||
ptr[i] = __nv_fp8_e4m3(value);
|
||||
}
|
||||
}
|
||||
|
||||
struct Args {
|
||||
int matrix_size = 8192;
|
||||
int warmup = 20;
|
||||
int iterations = 200;
|
||||
int first_gpu = 0;
|
||||
int gpu_count = -1;
|
||||
size_t workspace_mb = 256;
|
||||
int fast_accum = 1;
|
||||
};
|
||||
|
||||
static Args parse_args(int argc, char **argv) {
|
||||
Args args;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
auto need = [&](const char *name) {
|
||||
if (i + 1 >= argc) {
|
||||
std::fprintf(stderr, "Missing value for %s\n", name);
|
||||
std::exit(2);
|
||||
}
|
||||
return argv[++i];
|
||||
};
|
||||
if (!std::strcmp(argv[i], "--matrix-size")) {
|
||||
args.matrix_size = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--warmup")) {
|
||||
args.warmup = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--iterations")) {
|
||||
args.iterations = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--first-gpu")) {
|
||||
args.first_gpu = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--gpu-count")) {
|
||||
args.gpu_count = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--workspace-mb")) {
|
||||
args.workspace_mb = static_cast<size_t>(std::atoll(need(argv[i])));
|
||||
} else if (!std::strcmp(argv[i], "--fast-accum")) {
|
||||
args.fast_accum = std::atoi(need(argv[i]));
|
||||
} else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
|
||||
std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
|
||||
"[--iterations N] [--first-gpu N] [--gpu-count N] "
|
||||
"[--workspace-mb N] [--fast-accum 0|1]");
|
||||
std::exit(0);
|
||||
} else {
|
||||
std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
|
||||
std::exit(2);
|
||||
}
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
||||
static double run_one_gpu(int gpu, const Args &args) {
|
||||
CHECK_CUDA(cudaSetDevice(gpu));
|
||||
|
||||
const int64_t m = args.matrix_size;
|
||||
const int64_t n = args.matrix_size;
|
||||
const int64_t k = args.matrix_size;
|
||||
const size_t a_elems = static_cast<size_t>(m) * k;
|
||||
const size_t b_elems = static_cast<size_t>(k) * n;
|
||||
const size_t d_elems = static_cast<size_t>(m) * n;
|
||||
|
||||
__nv_fp8_e4m3 *d_a = nullptr;
|
||||
__nv_fp8_e4m3 *d_b = nullptr;
|
||||
__nv_bfloat16 *d_d = nullptr;
|
||||
void *workspace = nullptr;
|
||||
float *d_scale_a = nullptr;
|
||||
float *d_scale_b = nullptr;
|
||||
const float scale = 1.0f;
|
||||
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
|
||||
|
||||
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
|
||||
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
|
||||
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
|
||||
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
|
||||
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
|
||||
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
|
||||
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
||||
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
||||
|
||||
const int threads = 256;
|
||||
const int blocks = 4096;
|
||||
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
|
||||
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
|
||||
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
|
||||
CHECK_CUDA(cudaGetLastError());
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cublasLtHandle_t lt;
|
||||
cublasLtMatmulDesc_t op_desc;
|
||||
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
|
||||
cublasLtMatmulPreference_t preference;
|
||||
CHECK_CUBLAS(cublasLtCreate(<));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
|
||||
|
||||
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
|
||||
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
|
||||
// acceptance shape while satisfying the library's FP8 kernel constraints.
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
|
||||
sizeof(d_scale_a)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
|
||||
sizeof(d_scale_b)));
|
||||
int8_t fast_accum = args.fast_accum ? 1 : 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
|
||||
sizeof(fast_accum)));
|
||||
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
|
||||
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
|
||||
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
|
||||
sizeof(workspace_bytes)));
|
||||
|
||||
cublasLtMatmulHeuristicResult_t heuristic;
|
||||
int returned = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
|
||||
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
|
||||
&returned));
|
||||
if (returned == 0) {
|
||||
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
int32_t value = -1;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
uint32_t value = 0;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
uint16_t value = 0;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
|
||||
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
|
||||
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
|
||||
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
|
||||
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
|
||||
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
auto matmul = [&]() {
|
||||
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
|
||||
&beta, d_d, d_desc, d_d, d_desc,
|
||||
&heuristic.algo, workspace, workspace_bytes, 0));
|
||||
};
|
||||
|
||||
for (int i = 0; i < args.warmup; ++i) {
|
||||
matmul();
|
||||
}
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
CHECK_CUDA(cudaEventCreate(&start));
|
||||
CHECK_CUDA(cudaEventCreate(&stop));
|
||||
CHECK_CUDA(cudaEventRecord(start));
|
||||
for (int i = 0; i < args.iterations; ++i) {
|
||||
matmul();
|
||||
}
|
||||
CHECK_CUDA(cudaEventRecord(stop));
|
||||
CHECK_CUDA(cudaEventSynchronize(stop));
|
||||
float elapsed_ms = 0.0f;
|
||||
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
|
||||
const double flops =
|
||||
2.0 * static_cast<double>(m) * static_cast<double>(n) *
|
||||
static_cast<double>(k) * static_cast<double>(args.iterations);
|
||||
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
|
||||
std::printf(
|
||||
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
|
||||
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
|
||||
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
|
||||
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
|
||||
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
|
||||
std::fflush(stdout);
|
||||
|
||||
CHECK_CUDA(cudaEventDestroy(start));
|
||||
CHECK_CUDA(cudaEventDestroy(stop));
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
|
||||
CHECK_CUBLAS(cublasLtDestroy(lt));
|
||||
CHECK_CUDA(cudaFree(d_a));
|
||||
CHECK_CUDA(cudaFree(d_b));
|
||||
CHECK_CUDA(cudaFree(d_d));
|
||||
CHECK_CUDA(cudaFree(workspace));
|
||||
CHECK_CUDA(cudaFree(d_scale_a));
|
||||
CHECK_CUDA(cudaFree(d_scale_b));
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
return tflops;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
Args args = parse_args(argc, argv);
|
||||
int device_count = 0;
|
||||
CHECK_CUDA(cudaGetDeviceCount(&device_count));
|
||||
if (args.gpu_count < 0) {
|
||||
args.gpu_count = device_count - args.first_gpu;
|
||||
}
|
||||
if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
|
||||
std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
|
||||
args.first_gpu, args.gpu_count, device_count);
|
||||
return 2;
|
||||
}
|
||||
|
||||
std::vector<double> values;
|
||||
std::printf("{\n");
|
||||
std::printf(" \"source\": \"cuBLASLt\",\n");
|
||||
std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
|
||||
std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
|
||||
std::printf(" \"warmup\": %d,\n", args.warmup);
|
||||
std::printf(" \"iterations\": %d,\n", args.iterations);
|
||||
std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
|
||||
std::printf(" \"per_gpu\": [\n");
|
||||
for (int i = 0; i < args.gpu_count; ++i) {
|
||||
int gpu = args.first_gpu + i;
|
||||
double tflops = run_one_gpu(gpu, args);
|
||||
values.push_back(tflops);
|
||||
}
|
||||
double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
|
||||
auto minmax = std::minmax_element(values.begin(), values.end());
|
||||
double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
|
||||
std::printf(" ],\n");
|
||||
std::printf(" \"mean_tflops\": %.1f,\n", mean);
|
||||
std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
|
||||
std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
|
||||
std::printf(" \"spread_pct\": %.2f\n", spread);
|
||||
std::printf("}\n");
|
||||
return mean >= 1400.0 ? 0 : 1;
|
||||
}
|
||||
@ -1,277 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
|
||||
|
||||
Paths:
|
||||
A. torch._scaled_mm eager, default accumulation
|
||||
B. torch._scaled_mm eager, use_fast_accum=True
|
||||
C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
|
||||
D. Transformer Engine Linear under fp8_autocast, when installed
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Callable
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
|
||||
flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
|
||||
return flops / (elapsed_ms / 1000.0) / 1e12
|
||||
|
||||
|
||||
def cuda_event_bench(
|
||||
name: str,
|
||||
matrix_size: int,
|
||||
iterations: int,
|
||||
warmup: int,
|
||||
func: Callable[[int], Any],
|
||||
) -> dict[str, Any]:
|
||||
for i in range(warmup):
|
||||
func(i)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
wall_start = time.perf_counter()
|
||||
start.record()
|
||||
for i in range(iterations):
|
||||
func(i)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
wall_elapsed = time.perf_counter() - wall_start
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
return {
|
||||
"name": name,
|
||||
"status": "ok",
|
||||
"matrix_size": matrix_size,
|
||||
"iterations": iterations,
|
||||
"warmup": warmup,
|
||||
"event_ms_total": round(elapsed_ms, 3),
|
||||
"event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
|
||||
"wall_ms_total": round(wall_elapsed * 1000.0, 3),
|
||||
"tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
|
||||
}
|
||||
|
||||
|
||||
def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
|
||||
a = [
|
||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||
for _ in range(pools)
|
||||
]
|
||||
b = [
|
||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||
for _ in range(pools)
|
||||
]
|
||||
torch.cuda.synchronize()
|
||||
return a, b
|
||||
|
||||
|
||||
def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
|
||||
device = f"cuda:{args.gpu_index}"
|
||||
torch.cuda.set_device(args.gpu_index)
|
||||
scale_a = torch.tensor(1.0, device=device)
|
||||
scale_b = torch.tensor(1.0, device=device)
|
||||
pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
|
||||
results: list[dict[str, Any]] = []
|
||||
|
||||
def eager_default(i: int) -> torch.Tensor:
|
||||
idx = i % args.pools
|
||||
return torch._scaled_mm(
|
||||
pools_a[idx],
|
||||
pools_b[idx].T,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
def eager_fast(i: int) -> torch.Tensor:
|
||||
idx = i % args.pools
|
||||
return torch._scaled_mm(
|
||||
pools_a[idx],
|
||||
pools_b[idx].T,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=torch.bfloat16,
|
||||
use_fast_accum=True,
|
||||
)
|
||||
|
||||
results.append(
|
||||
cuda_event_bench(
|
||||
"A_eager_scaled_mm_default",
|
||||
args.matrix_size,
|
||||
args.iterations,
|
||||
args.warmup,
|
||||
eager_default,
|
||||
)
|
||||
)
|
||||
results.append(
|
||||
cuda_event_bench(
|
||||
"B_eager_scaled_mm_fast_accum",
|
||||
args.matrix_size,
|
||||
args.iterations,
|
||||
args.warmup,
|
||||
eager_fast,
|
||||
)
|
||||
)
|
||||
|
||||
graph_out = torch.empty(
|
||||
(args.matrix_size, args.matrix_size),
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
static_a = pools_a[0]
|
||||
static_b_t = pools_b[0].T
|
||||
|
||||
try:
|
||||
side_stream = torch.cuda.Stream()
|
||||
side_stream.wait_stream(torch.cuda.current_stream())
|
||||
with torch.cuda.stream(side_stream):
|
||||
for _ in range(max(3, args.warmup // 2)):
|
||||
torch._scaled_mm(
|
||||
static_a,
|
||||
static_b_t,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=torch.bfloat16,
|
||||
use_fast_accum=True,
|
||||
out=graph_out,
|
||||
)
|
||||
torch.cuda.current_stream().wait_stream(side_stream)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
torch._scaled_mm(
|
||||
static_a,
|
||||
static_b_t,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=torch.bfloat16,
|
||||
use_fast_accum=True,
|
||||
out=graph_out,
|
||||
)
|
||||
|
||||
def graph_replay(_: int) -> None:
|
||||
graph.replay()
|
||||
|
||||
results.append(
|
||||
cuda_event_bench(
|
||||
"C_cuda_graph_scaled_mm_fast_accum",
|
||||
args.matrix_size,
|
||||
args.iterations,
|
||||
3,
|
||||
graph_replay,
|
||||
)
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
results.append(
|
||||
{
|
||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
||||
"status": "unavailable",
|
||||
"reason": f"{type(exc).__name__}: {exc}",
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
|
||||
try:
|
||||
import transformer_engine.pytorch as te # type: ignore[import-not-found]
|
||||
from transformer_engine.common.recipe import DelayedScaling, Format # type: ignore[import-not-found]
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return {
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "unavailable",
|
||||
"reason": f"{type(exc).__name__}: {exc}",
|
||||
}
|
||||
|
||||
device = f"cuda:{args.gpu_index}"
|
||||
x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
|
||||
layer = te.Linear(
|
||||
args.matrix_size,
|
||||
args.matrix_size,
|
||||
bias=False,
|
||||
params_dtype=torch.bfloat16,
|
||||
device=device,
|
||||
)
|
||||
recipe = DelayedScaling(fp8_format=Format.HYBRID)
|
||||
|
||||
def run(_: int) -> torch.Tensor:
|
||||
with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
|
||||
return layer(x)
|
||||
|
||||
try:
|
||||
result = cuda_event_bench(
|
||||
"D_transformer_engine_fp8_linear",
|
||||
args.matrix_size,
|
||||
args.iterations,
|
||||
args.warmup,
|
||||
run,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return {
|
||||
"name": "D_transformer_engine_fp8_linear",
|
||||
"status": "error",
|
||||
"reason": f"{type(exc).__name__}: {exc}",
|
||||
}
|
||||
result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
||||
return result
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--matrix-size", type=int, default=8192)
|
||||
parser.add_argument("--warmup", type=int, default=20)
|
||||
parser.add_argument("--iterations", type=int, default=100)
|
||||
parser.add_argument("--gpu-index", type=int, default=0)
|
||||
parser.add_argument("--pools", type=int, default=4)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
print(json.dumps({"error": "cuda unavailable"}, indent=2))
|
||||
return 1
|
||||
if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
|
||||
print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
|
||||
return 1
|
||||
|
||||
torch.cuda.set_device(args.gpu_index)
|
||||
props = torch.cuda.get_device_properties(args.gpu_index)
|
||||
payload = {
|
||||
"source": "pytorch_fp8_path_bench",
|
||||
"torch": torch.__version__,
|
||||
"cuda": torch.version.cuda,
|
||||
"gpu_index": args.gpu_index,
|
||||
"gpu_name": props.name,
|
||||
"matrix_size": args.matrix_size,
|
||||
"warmup": args.warmup,
|
||||
"iterations": args.iterations,
|
||||
"results": [],
|
||||
}
|
||||
try:
|
||||
payload["results"].extend(bench_scaled_mm(args))
|
||||
payload["results"].append(bench_transformer_engine(args))
|
||||
except torch.cuda.OutOfMemoryError as exc:
|
||||
payload["error"] = f"CUDA OOM: {exc}"
|
||||
print(json.dumps(payload, indent=2))
|
||||
return 1
|
||||
|
||||
ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
|
||||
if ok_values:
|
||||
payload["summary"] = {
|
||||
"max_tflops": round(max(ok_values), 1),
|
||||
"min_tflops": round(min(ok_values), 1),
|
||||
"mean_tflops": round(statistics.mean(ok_values), 1),
|
||||
}
|
||||
print(json.dumps(payload, indent=2))
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@ -1,45 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -uo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
|
||||
|
||||
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
|
||||
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
|
||||
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
|
||||
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
|
||||
WARMUP="${WARMUP:-20}"
|
||||
ITERATIONS="${ITERATIONS:-200}"
|
||||
GPU_COUNT="${GPU_COUNT:-8}"
|
||||
FIRST_GPU="${FIRST_GPU:-0}"
|
||||
WORKSPACE_MB="${WORKSPACE_MB:-256}"
|
||||
|
||||
if [[ ! -x "$NVCC" ]]; then
|
||||
echo "nvcc not found: $NVCC" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
|
||||
HOST="$(hostname 2>/dev/null || echo unknown)"
|
||||
TS="$(date +%Y%m%d_%H%M%S)"
|
||||
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
|
||||
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
|
||||
|
||||
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
|
||||
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
|
||||
-lcublasLt -lcublas -o "$BIN"
|
||||
|
||||
set +e
|
||||
"$BIN" \
|
||||
--matrix-size "$MATRIX_SIZE" \
|
||||
--warmup "$WARMUP" \
|
||||
--iterations "$ITERATIONS" \
|
||||
--first-gpu "$FIRST_GPU" \
|
||||
--gpu-count "$GPU_COUNT" \
|
||||
--workspace-mb "$WORKSPACE_MB" \
|
||||
| tee "$REPORT"
|
||||
status=${PIPESTATUS[0]}
|
||||
set -e
|
||||
|
||||
echo "Report written to: $REPORT"
|
||||
exit "$status"
|
||||
@ -1,93 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
|
||||
|
||||
PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
|
||||
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
|
||||
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
|
||||
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
|
||||
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
|
||||
WARMUP="${WARMUP:-20}"
|
||||
ITERATIONS="${ITERATIONS:-100}"
|
||||
GPU_INDEX="${GPU_INDEX:-0}"
|
||||
WORKSPACE_MB="${WORKSPACE_MB:-256}"
|
||||
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
|
||||
import site
|
||||
print(site.getsitepackages()[0])
|
||||
PY
|
||||
)"
|
||||
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"
|
||||
|
||||
mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
|
||||
|
||||
HOST="$(hostname 2>/dev/null || echo unknown)"
|
||||
TS="$(date +%Y%m%d_%H%M%S)"
|
||||
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
|
||||
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
|
||||
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
|
||||
|
||||
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
|
||||
--matrix-size "$MATRIX_SIZE" \
|
||||
--warmup "$WARMUP" \
|
||||
--iterations "$ITERATIONS" \
|
||||
--gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
|
||||
|
||||
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
|
||||
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
|
||||
-lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
|
||||
|
||||
"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
|
||||
--matrix-size "$MATRIX_SIZE" \
|
||||
--warmup "$WARMUP" \
|
||||
--iterations "$ITERATIONS" \
|
||||
--first-gpu "$GPU_INDEX" \
|
||||
--gpu-count 1 \
|
||||
--workspace-mb "$WORKSPACE_MB" \
|
||||
--fast-accum 1 | tee "$CUBLAS_REPORT"
|
||||
|
||||
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
|
||||
import json
|
||||
import pathlib
|
||||
import sys
|
||||
|
||||
py_report = pathlib.Path(sys.argv[1])
|
||||
cublas_report = pathlib.Path(sys.argv[2])
|
||||
combined_report = pathlib.Path(sys.argv[3])
|
||||
|
||||
with py_report.open() as f:
|
||||
py_payload = json.load(f)
|
||||
with cublas_report.open() as f:
|
||||
cublas_payload = json.load(f)
|
||||
|
||||
combined = {
|
||||
"source": "fp8_path_comparison",
|
||||
"host": cublas_payload.get("host"),
|
||||
"matrix_size": py_payload.get("matrix_size"),
|
||||
"gpu_index": py_payload.get("gpu_index"),
|
||||
"pytorch": py_payload,
|
||||
"cublaslt": cublas_payload,
|
||||
"results": [],
|
||||
}
|
||||
combined["results"].extend(py_payload.get("results", []))
|
||||
per_gpu = cublas_payload.get("per_gpu", [])
|
||||
if per_gpu:
|
||||
row = dict(per_gpu[0])
|
||||
row.update({
|
||||
"name": "E_direct_cublaslt_fast_accum",
|
||||
"status": "ok",
|
||||
"tflops": row.pop("fp8_tflops"),
|
||||
"matrix_size": cublas_payload.get("matrix_size"),
|
||||
"iterations": cublas_payload.get("iterations"),
|
||||
"warmup": cublas_payload.get("warmup"),
|
||||
"fast_accum": cublas_payload.get("fast_accum"),
|
||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
|
||||
})
|
||||
combined["results"].append(row)
|
||||
|
||||
combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
|
||||
print(f"Combined report written to: {combined_report}")
|
||||
PY
|
||||
|
||||
echo "$COMBINED_REPORT"
|
||||
Loading…
x
Reference in New Issue
Block a user