Compare commits
37 Commits
4dddab27b3
...
211140e7f1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
211140e7f1 | ||
|
|
9b0e6e29df | ||
|
|
cfca4df62a | ||
|
|
6d72c67468 | ||
|
|
64f203a9be | ||
|
|
c48f9f915d | ||
|
|
0a772103f0 | ||
|
|
f5699bf85a | ||
|
|
59621d26a0 | ||
|
|
18cebd8e06 | ||
|
|
1a8cf6cbbb | ||
|
|
63c32fd75d | ||
|
|
4f38b3a2a0 | ||
|
|
8ff5021385 | ||
|
|
bdffd7e616 | ||
|
|
b6b1ccc2dc | ||
|
|
ec6b868d3f | ||
|
|
3a0e739991 | ||
|
|
b914cb7b4b | ||
|
|
adcdb36e05 | ||
|
|
0d63ea5e05 | ||
|
|
08e0d93a16 | ||
|
|
9a32645c9d | ||
|
|
71ac97a24e | ||
|
|
890e623be4 | ||
|
|
17441a4583 | ||
|
|
05294a66d8 | ||
|
|
6b302c8b09 | ||
|
|
cd1728a7c3 | ||
|
|
48a27472e2 | ||
|
|
49d358c0ca | ||
|
|
519de86553 | ||
|
|
3268f4bd2a | ||
|
|
b82ac54218 | ||
|
|
28fe55d5c7 | ||
|
|
ac91f1aeb5 | ||
|
|
2a51be1ba3 |
@ -1,4 +1,4 @@
|
|||||||
# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
|
# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
|
||||||
gpu_type: auto
|
gpu_type: auto
|
||||||
|
|
||||||
benchmark:
|
benchmark:
|
||||||
@ -14,25 +14,12 @@ benchmark:
|
|||||||
- fp16
|
- fp16
|
||||||
- bf16
|
- bf16
|
||||||
- fp8
|
- fp8
|
||||||
# MAMF-style shape sweep: measure each dtype at every shape below and keep the max
|
- fp64
|
||||||
# TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
|
- int8
|
||||||
# ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
|
matrix_size: 8192
|
||||||
# Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
|
warmup: 50
|
||||||
# (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
|
iterations: 500
|
||||||
# Empty list => single matrix_size shape (legacy behaviour).
|
use_compile: true
|
||||||
sweep_sizes:
|
|
||||||
- 3584
|
|
||||||
- 4608
|
|
||||||
- 5376
|
|
||||||
- 8192
|
|
||||||
- 11520
|
|
||||||
- [2048, 2048, 13312]
|
|
||||||
- [2048, 2048, 16384]
|
|
||||||
matrix_size: 8192 # fallback shape when sweep_sizes is empty
|
|
||||||
warmup: 20
|
|
||||||
iterations: 80
|
|
||||||
# NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
|
|
||||||
# GEMM, and compiling would re-autotune per shape and make the sweep very slow.
|
|
||||||
|
|
||||||
health:
|
health:
|
||||||
temp_warning: 75
|
temp_warning: 75
|
||||||
@ -44,9 +31,15 @@ nccl:
|
|||||||
test_allreduce: true
|
test_allreduce: true
|
||||||
test_alltoall: true
|
test_alltoall: true
|
||||||
test_broadcast: true
|
test_broadcast: true
|
||||||
test_reduce_scatter: false
|
test_reduce_scatter: true
|
||||||
test_allgather: false
|
test_allgather: true
|
||||||
test_sendrecv: false
|
test_sendrecv: true
|
||||||
|
message_sizes:
|
||||||
|
- 1M
|
||||||
|
- 256M
|
||||||
|
- 2G
|
||||||
|
repeats: 3
|
||||||
|
max_stddev_pct: 3
|
||||||
|
|
||||||
multinode_nccl:
|
multinode_nccl:
|
||||||
enabled: false
|
enabled: false
|
||||||
@ -98,44 +91,66 @@ multinode_nccl:
|
|||||||
alltoall: 75
|
alltoall: 75
|
||||||
|
|
||||||
stress:
|
stress:
|
||||||
duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
|
duration_sec: 1800
|
||||||
|
production_duration_sec: 1800
|
||||||
|
use_gpu_burn: false
|
||||||
use_doubles: false
|
use_doubles: false
|
||||||
use_tensor_cores: true
|
use_tensor_cores: true
|
||||||
memory_pct: 90
|
memory_pct: 90
|
||||||
gpus: all
|
gpus: all
|
||||||
|
dtype: bf16
|
||||||
|
matrix_size: 24576
|
||||||
|
telemetry_interval_sec: 1
|
||||||
|
warmup_sec: 60
|
||||||
|
min_steady_samples: 10
|
||||||
|
max_temp_c: 80
|
||||||
|
max_temp_delta_c: 5
|
||||||
|
min_power_watts: 630
|
||||||
|
max_tflops_jitter_pct: 5
|
||||||
|
require_tflops_jitter: true
|
||||||
|
|
||||||
rdma:
|
rdma:
|
||||||
min_bandwidth_gbps: 50
|
min_bandwidth_gbps: 47
|
||||||
max_latency_us: 10
|
min_port_rate_gbps: 400
|
||||||
|
max_latency_us: 3.5
|
||||||
|
max_write_latency_us: 2.0
|
||||||
|
max_read_latency_us: 3.5
|
||||||
ib_iterations: 1000
|
ib_iterations: 1000
|
||||||
msg_size: 65536
|
msg_size: 4194304
|
||||||
|
latency_msg_size: 8
|
||||||
ib_device: null
|
ib_device: null
|
||||||
ib_port: 1
|
ib_port: 1
|
||||||
# Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
|
server_addr: null
|
||||||
# node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
|
ibping_target: null
|
||||||
# ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
|
ibping_count: 5
|
||||||
# then drives the local client per device.
|
role: auto
|
||||||
cross_node:
|
pfc_ecn_counters: true
|
||||||
enabled: false # set true on the client node to run cross-node RDMA
|
|
||||||
server: null # peer ssh address, e.g. 172.72.8.12 (server node)
|
nvlink:
|
||||||
server_addr: null # OOB addr client connects to (default: = server)
|
expected_links_per_gpu: 18
|
||||||
ssh_user: root
|
expected_link_speed_gbps: 25
|
||||||
devices: [] # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
|
require_zero_errors: true
|
||||||
ib_port: 1
|
|
||||||
gid_index: null # -x <n> for RoCE; null for pure InfiniBand
|
dcgm:
|
||||||
msg_size: 1048576 # 1 MiB — large enough to reach NDR400 peak
|
diag_level: 3
|
||||||
iters: 5000
|
timeout_sec: 3600
|
||||||
base_oob_port: 18515 # per-device OOB port = base + device index
|
expected_num_gpus: 8
|
||||||
server_warmup_sec: 2.0
|
json_output: true
|
||||||
min_bandwidth_gbps: 350 # per-port PASS floor (NDR400 ≈ 0.9 × 400)
|
require_subtests: true
|
||||||
max_latency_us: 5
|
|
||||||
|
|
||||||
training:
|
training:
|
||||||
model: gpt2
|
model: synthetic_1.5b
|
||||||
batch_size: 8
|
batch_size: 8
|
||||||
seq_length: 2048
|
seq_length: 2048
|
||||||
num_steps: 50
|
num_steps: 50
|
||||||
|
warmup_steps: 5
|
||||||
dtype: bf16
|
dtype: bf16
|
||||||
|
mode: ddp
|
||||||
|
synthetic_params_b: 1.5
|
||||||
|
min_tokens_per_sec: 45000
|
||||||
|
max_step_jitter_pct: 3
|
||||||
|
max_peak_memory_gb: 70
|
||||||
|
require_distributed: true
|
||||||
|
|
||||||
report:
|
report:
|
||||||
output_dir: ./reports
|
output_dir: ./reports
|
||||||
|
|||||||
@ -312,31 +312,10 @@ class Benchmark:
|
|||||||
def run_compute_benchmark(self, dtypes: Optional[List[str]] = None) -> dict:
|
def run_compute_benchmark(self, dtypes: Optional[List[str]] = None) -> dict:
|
||||||
comp_cfg = self.bench_cfg.get("compute", {})
|
comp_cfg = self.bench_cfg.get("compute", {})
|
||||||
configured_dtypes = dtypes or comp_cfg.get("dtypes", ["fp32", "tf32", "fp16", "bf16", "fp8"])
|
configured_dtypes = dtypes or comp_cfg.get("dtypes", ["fp32", "tf32", "fp16", "bf16", "fp8"])
|
||||||
|
matrix_size = comp_cfg.get("matrix_size", 4096)
|
||||||
# MAMF-style shape sweep (à la stas00's mamf-finder): a single fixed matmul
|
warmup = comp_cfg.get("warmup", 10)
|
||||||
# shape under-reports the achievable peak by ~7-12% and therefore can't meet
|
iterations = comp_cfg.get("iterations", 100)
|
||||||
# the MAMF-calibrated PASS thresholds in gpu_specs.compute_pass_thresholds_tflops.
|
use_compile = comp_cfg.get("use_compile", False)
|
||||||
# So for each dtype we time several matmul shapes and keep the MAXIMUM TFLOPS
|
|
||||||
# (the realistic peak). matrix_size is the fallback when sweep_sizes is empty.
|
|
||||||
matrix_size = comp_cfg.get("matrix_size", 8192)
|
|
||||||
sweep_sizes = comp_cfg.get("sweep_sizes") or [matrix_size]
|
|
||||||
warmup = comp_cfg.get("warmup", 20)
|
|
||||||
iterations = comp_cfg.get("iterations", 80)
|
|
||||||
|
|
||||||
# Each sweep entry is either an int N (square N×N×N) or an [M, N, K] triple.
|
|
||||||
# Non-square / K-heavy shapes (e.g. 2048×2048×13312) reach the true MAMF peak
|
|
||||||
# on Hopper — square-only tops out ~5% lower — so the default set mixes both.
|
|
||||||
def _to_shape(entry):
|
|
||||||
if isinstance(entry, (list, tuple)):
|
|
||||||
if len(entry) == 3:
|
|
||||||
return tuple(int(x) for x in entry)
|
|
||||||
if len(entry) == 1:
|
|
||||||
n = int(entry[0])
|
|
||||||
return (n, n, n)
|
|
||||||
raise ValueError(f"sweep size {entry!r} must be an int or [M, N, K]")
|
|
||||||
n = int(entry)
|
|
||||||
return (n, n, n)
|
|
||||||
shapes = [_to_shape(e) for e in sweep_sizes]
|
|
||||||
|
|
||||||
if not TORCH_AVAILABLE:
|
if not TORCH_AVAILABLE:
|
||||||
self.console.print("[yellow]PyTorch not available - skipping compute benchmark[/yellow]")
|
self.console.print("[yellow]PyTorch not available - skipping compute benchmark[/yellow]")
|
||||||
@ -344,22 +323,37 @@ class Benchmark:
|
|||||||
|
|
||||||
gpu_count = torch.cuda.device_count()
|
gpu_count = torch.cuda.device_count()
|
||||||
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
|
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
|
||||||
if len(sweep_sizes) > 1:
|
|
||||||
self.console.print(
|
# torch.compile(max-autotune) benchmarks cuBLAS vs Triton kernels and picks
|
||||||
f"[cyan] MAMF shape sweep over {len(sweep_sizes)} sizes: "
|
# the fastest for this GPU/shape, typically improving efficiency by 8-15%.
|
||||||
f"{', '.join(str(s) for s in sweep_sizes)}[/cyan]"
|
# compile_warmup must be larger than warmup to absorb JIT + autotuning time.
|
||||||
)
|
mm_fn = torch.matmul
|
||||||
|
compile_warmup = warmup
|
||||||
|
if use_compile:
|
||||||
|
try:
|
||||||
|
_compiled = torch.compile(torch.matmul, mode="max-autotune")
|
||||||
|
# Trial call to trigger JIT and verify compilation succeeds before the dtype loop.
|
||||||
|
_t = torch.randn(64, 64, device="cuda", dtype=torch.float32)
|
||||||
|
_compiled(_t, _t)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
del _t
|
||||||
|
mm_fn = _compiled
|
||||||
|
compile_warmup = max(warmup, 50)
|
||||||
|
self.console.print("[cyan] torch.compile(max-autotune) enabled[/cyan]")
|
||||||
|
except Exception as e:
|
||||||
|
self.console.print(f"[yellow] torch.compile unavailable ({type(e).__name__}), using eager[/yellow]")
|
||||||
|
|
||||||
dtype_map = {
|
dtype_map = {
|
||||||
"fp32": (torch.float32, self.specs["fp32_tflops"]),
|
"fp32": (torch.float32, self.specs.get("fp32_tflops", 0)),
|
||||||
"tf32": ("tf32", self.specs["tf32_tflops"]),
|
"tf32": ("tf32", self.specs.get("tf32_tflops", 0)),
|
||||||
"fp16": (torch.float16, self.specs["fp16_tflops"]),
|
"fp16": (torch.float16, self.specs.get("fp16_tflops", 0)),
|
||||||
"bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
|
"bf16": (torch.bfloat16, self.specs.get("bf16_tflops", 0)),
|
||||||
"fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
|
"fp8": (getattr(torch, "float8_e4m3fn", None), self.specs.get("fp8_tflops", 0)),
|
||||||
|
"fp64": (torch.float64, self.specs.get("fp64_tflops", 0)),
|
||||||
|
"int8": (torch.int8, self.specs.get("int8_tflops", 0)),
|
||||||
}
|
}
|
||||||
|
|
||||||
results_by_dtype = {}
|
results_by_dtype = {}
|
||||||
best_shapes = {}
|
|
||||||
per_gpu_results = [{"index": i} for i in range(gpu_count)]
|
per_gpu_results = [{"index": i} for i in range(gpu_count)]
|
||||||
|
|
||||||
with Progress(
|
with Progress(
|
||||||
@ -382,41 +376,27 @@ class Benchmark:
|
|||||||
progress.advance(task)
|
progress.advance(task)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
dtype_val, peak_tflops = dtype_map[dtype_name]
|
gpu_values = []
|
||||||
|
errors = []
|
||||||
# allow_tf32 only affects float32 matmuls: ON for the TF32 run, OFF for
|
for gpu_idx in range(gpu_count):
|
||||||
# the true-FP32 run so the two stay distinct.
|
|
||||||
old_tf32 = torch.backends.cuda.matmul.allow_tf32
|
|
||||||
if dtype_name == "tf32":
|
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
|
||||||
dtype_val = torch.float32
|
|
||||||
elif dtype_name == "fp32":
|
|
||||||
torch.backends.cuda.matmul.allow_tf32 = False
|
|
||||||
|
|
||||||
best_tflops, best_shape, last_err = 0.0, None, None
|
|
||||||
for (M, N, K) in shapes:
|
|
||||||
try:
|
try:
|
||||||
t = self._bench_matmul_once(dtype_name, dtype_val, M, N, K, warmup, iterations)
|
val = self._benchmark_dtype_on_gpu(
|
||||||
if t > best_tflops:
|
dtype_name, dtype_map[dtype_name][0], matrix_size,
|
||||||
best_tflops, best_shape = t, (M, N, K)
|
warmup, compile_warmup, iterations, mm_fn, gpu_idx,
|
||||||
except Exception as e: # noqa: BLE001 - record and try the next shape
|
|
||||||
last_err = e
|
|
||||||
|
|
||||||
torch.backends.cuda.matmul.allow_tf32 = old_tf32
|
|
||||||
|
|
||||||
if best_shape is None:
|
|
||||||
results_by_dtype[dtype_name] = f"error: {last_err}"
|
|
||||||
self.console.print(f"[yellow] {dtype_name}: {last_err}[/yellow]")
|
|
||||||
else:
|
|
||||||
shape_str = "x".join(str(d) for d in best_shape)
|
|
||||||
results_by_dtype[dtype_name] = round(best_tflops, 1)
|
|
||||||
best_shapes[dtype_name] = shape_str
|
|
||||||
for pg in per_gpu_results:
|
|
||||||
pg[dtype_name] = round(best_tflops, 1)
|
|
||||||
if len(shapes) > 1:
|
|
||||||
self.console.print(
|
|
||||||
f"[dim] {dtype_name}: {best_tflops:.1f} TFLOPS @ {shape_str}[/dim]"
|
|
||||||
)
|
)
|
||||||
|
gpu_values.append(val)
|
||||||
|
per_gpu_results[gpu_idx][dtype_name] = round(val, 1)
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"gpu{gpu_idx}: {e}")
|
||||||
|
per_gpu_results[gpu_idx][dtype_name] = f"error: {e}"
|
||||||
|
finally:
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
if gpu_values:
|
||||||
|
results_by_dtype[dtype_name] = round(sum(gpu_values) / len(gpu_values), 1)
|
||||||
|
else:
|
||||||
|
results_by_dtype[dtype_name] = "error: " + "; ".join(errors[:3])
|
||||||
|
self.console.print(f"[yellow] {dtype_name}: {results_by_dtype[dtype_name]}[/yellow]")
|
||||||
|
|
||||||
progress.advance(task)
|
progress.advance(task)
|
||||||
|
|
||||||
@ -427,78 +407,119 @@ class Benchmark:
|
|||||||
if peak_tp:
|
if peak_tp:
|
||||||
efficiency[dt] = round((achieved / peak_tp) * 100, 1)
|
efficiency[dt] = round((achieved / peak_tp) * 100, 1)
|
||||||
|
|
||||||
|
consistency = {}
|
||||||
|
for dt in results_by_dtype:
|
||||||
|
vals = [pg.get(dt) for pg in per_gpu_results]
|
||||||
|
nums = [v for v in vals if isinstance(v, (int, float))]
|
||||||
|
if len(nums) >= 2:
|
||||||
|
mean = sum(nums) / len(nums)
|
||||||
|
spread_pct = ((max(nums) - min(nums)) / mean * 100) if mean else 0
|
||||||
|
consistency[dt] = {
|
||||||
|
"mean_tflops": round(mean, 1),
|
||||||
|
"min_tflops": round(min(nums), 1),
|
||||||
|
"max_tflops": round(max(nums), 1),
|
||||||
|
"spread_pct": round(spread_pct, 2),
|
||||||
|
"max_allowed_pct": 3,
|
||||||
|
"passed": spread_pct <= 3,
|
||||||
|
}
|
||||||
|
|
||||||
|
pass_thresholds = dict(self.specs.get("compute_pass_thresholds_tflops") or {})
|
||||||
|
threshold_passed = True
|
||||||
|
for dt, threshold in pass_thresholds.items():
|
||||||
|
val = results_by_dtype.get(dt)
|
||||||
|
if not isinstance(val, (int, float)) or val < threshold:
|
||||||
|
threshold_passed = False
|
||||||
|
break
|
||||||
|
consistency_passed = all(row.get("passed", False) for row in consistency.values()) if consistency else True
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"compute": {
|
"compute": {
|
||||||
|
"passed": threshold_passed and consistency_passed,
|
||||||
"per_dtype_tflops": results_by_dtype,
|
"per_dtype_tflops": results_by_dtype,
|
||||||
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
"peak_tflops": {dt: dtype_map[dt][1] for dt in dtype_map},
|
||||||
"efficiency_pct": efficiency,
|
"efficiency_pct": efficiency,
|
||||||
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
|
# Absolute TFLOPS PASS thresholds (decoupled from peak). When present,
|
||||||
# report.py judges PASS/WARN/FAIL against these directly instead of
|
# report.py judges PASS/WARN/FAIL against these directly instead of
|
||||||
# using % of peak. Empty dict => fall back to legacy 80% rule.
|
# using % of peak. Empty dict => fall back to legacy 80% rule.
|
||||||
"pass_thresholds_tflops": dict(
|
"pass_thresholds_tflops": pass_thresholds,
|
||||||
self.specs.get("compute_pass_thresholds_tflops") or {}
|
|
||||||
),
|
|
||||||
"per_gpu": per_gpu_results,
|
"per_gpu": per_gpu_results,
|
||||||
"sweep_sizes": list(sweep_sizes),
|
"consistency": consistency,
|
||||||
"best_shapes": best_shapes,
|
|
||||||
"matrix_size": matrix_size,
|
"matrix_size": matrix_size,
|
||||||
"warmup": warmup,
|
"warmup": warmup,
|
||||||
"iterations": iterations,
|
"iterations": iterations,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def _bench_matmul_once(self, dtype_name: str, dtype_val, M: int, N: int, K: int,
|
def _benchmark_dtype_on_gpu(self, dtype_name: str, dtype_val, matrix_size: int,
|
||||||
warmup: int, iterations: int) -> float:
|
warmup: int, compile_warmup: int, iterations: int,
|
||||||
"""Time one (M×K)·(K×N) matmul for a dtype and return achieved TFLOPS.
|
mm_fn, gpu_idx: int) -> float:
|
||||||
|
if dtype_name == "fp8" and dtype_val is None:
|
||||||
|
raise RuntimeError("torch.float8_e4m3fn unavailable")
|
||||||
|
device = f"cuda:{gpu_idx}"
|
||||||
|
old_tf32 = torch.backends.cuda.matmul.allow_tf32
|
||||||
|
try:
|
||||||
|
with torch.cuda.device(gpu_idx):
|
||||||
|
if dtype_name == "tf32":
|
||||||
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
dtype_val = torch.float32
|
||||||
|
|
||||||
Uses an L2-cache-busting pool of matrix pairs (total > 256 MB) so operands
|
M = N = K = matrix_size
|
||||||
can't be served from L2 across iterations, and CUDA events for timing. FP8
|
if dtype_name == "int8" and M > 4096:
|
||||||
goes through torch._scaled_mm (e4m3); all others through torch.matmul — eager
|
# torch._int_mm on 8192 can be extremely memory hungry because the
|
||||||
cuBLAS, which on H100 beats torch.compile/Triton for plain GEMM and avoids the
|
# output is int32. Keep it production-visible, but bounded.
|
||||||
per-shape recompile cost that would make a sweep pathologically slow.
|
M = N = K = 4096
|
||||||
"""
|
|
||||||
elem_bytes = 1 if dtype_name == "fp8" else torch.tensor([], dtype=dtype_val).element_size()
|
elem_bytes = 1 if dtype_name in ("fp8", "int8") else torch.tensor([], dtype=dtype_val).element_size()
|
||||||
pair_bytes = (M * K + K * N) * elem_bytes
|
pair_bytes = 2 * M * K * elem_bytes
|
||||||
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes)) # ceil(256MB / pair)
|
num_pools = max(4, -(-256 * 1024 * 1024 // pair_bytes))
|
||||||
|
|
||||||
if dtype_name == "fp8":
|
if dtype_name == "fp8":
|
||||||
if not hasattr(torch, "_scaled_mm"):
|
if not hasattr(torch, "_scaled_mm"):
|
||||||
raise RuntimeError("torch._scaled_mm unavailable — upgrade to PyTorch >= 2.1")
|
raise RuntimeError("torch._scaled_mm unavailable")
|
||||||
pools_a = [torch.randn(M, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
pools_a = [torch.randn(M, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||||
pools_b = [torch.randn(N, K, device="cuda", dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
pools_b = [torch.randn(N, K, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) for _ in range(num_pools)]
|
||||||
scale_a = torch.tensor(1.0, device="cuda")
|
scale_a = torch.tensor(1.0, device=device)
|
||||||
scale_b = torch.tensor(1.0, device="cuda")
|
scale_b = torch.tensor(1.0, device=device)
|
||||||
def op(i):
|
|
||||||
|
def run(i):
|
||||||
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
|
return torch._scaled_mm(pools_a[i], pools_b[i].T, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
|
||||||
|
|
||||||
|
effective_warmup = warmup
|
||||||
|
elif dtype_name == "int8":
|
||||||
|
if not hasattr(torch, "_int_mm"):
|
||||||
|
raise RuntimeError("torch._int_mm unavailable")
|
||||||
|
pools_a = [torch.randint(-128, 127, (M, K), device=device, dtype=torch.int8) for _ in range(num_pools)]
|
||||||
|
pools_b = [torch.randint(-128, 127, (K, N), device=device, dtype=torch.int8) for _ in range(num_pools)]
|
||||||
|
|
||||||
|
def run(i):
|
||||||
|
return torch._int_mm(pools_a[i], pools_b[i])
|
||||||
|
|
||||||
|
effective_warmup = warmup
|
||||||
else:
|
else:
|
||||||
pools_a = [torch.randn(M, K, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
|
pools_a = [torch.randn(M, K, device=device, dtype=dtype_val) for _ in range(num_pools)]
|
||||||
pools_b = [torch.randn(K, N, device="cuda", dtype=dtype_val) for _ in range(num_pools)]
|
pools_b = [torch.randn(K, N, device=device, dtype=dtype_val) for _ in range(num_pools)]
|
||||||
def op(i):
|
|
||||||
return torch.matmul(pools_a[i], pools_b[i])
|
|
||||||
|
|
||||||
try:
|
def run(i):
|
||||||
# Probe once so a broken/unsupported kernel raises before the timed loop.
|
return mm_fn(pools_a[i], pools_b[i])
|
||||||
_probe = op(0)
|
|
||||||
|
effective_warmup = compile_warmup
|
||||||
|
|
||||||
|
for i in range(effective_warmup):
|
||||||
|
run(i % num_pools)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
del _probe
|
|
||||||
|
|
||||||
for i in range(warmup):
|
|
||||||
op(i % num_pools)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
start_event.record()
|
start_event.record()
|
||||||
for i in range(iterations):
|
for i in range(iterations):
|
||||||
op(i % num_pools)
|
c = run(i % num_pools)
|
||||||
end_event.record()
|
end_event.record()
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
elapsed_ms = start_event.elapsed_time(end_event)
|
elapsed_ms = start_event.elapsed_time(end_event)
|
||||||
|
del pools_a, pools_b, c
|
||||||
|
flops = 2 * M * N * K * iterations
|
||||||
|
return flops / (elapsed_ms / 1000) / 1e12
|
||||||
finally:
|
finally:
|
||||||
del pools_a, pools_b
|
torch.backends.cuda.matmul.allow_tf32 = old_tf32
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
return (2 * M * N * K * iterations) / (elapsed_ms / 1000) / 1e12
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def print_results(results: dict, console: Console = None):
|
def print_results(results: dict, console: Console = None):
|
||||||
@ -583,77 +604,24 @@ class Benchmark:
|
|||||||
f"[{ec}]{ef:.1f}%[/{ec}]")
|
f"[{ec}]{ef:.1f}%[/{ec}]")
|
||||||
c.print(table)
|
c.print(table)
|
||||||
|
|
||||||
@staticmethod
|
consistency = comp.get("consistency", {})
|
||||||
def judge_compute(results: dict) -> dict:
|
if consistency:
|
||||||
"""Judge compute results against pass_thresholds_tflops.
|
t_cons = Table(title="Per-GPU Consistency", box=None, padding=(0, 1))
|
||||||
|
t_cons.add_column("DType", style="bold")
|
||||||
Single source of truth for the PASS/WARN/FAIL rule (same one report.py uses):
|
t_cons.add_column("Min", justify="right")
|
||||||
achieved >= thr -> PASS; >= 0.9*thr -> WARN; else FAIL. A string achieved value
|
t_cons.add_column("Mean", justify="right")
|
||||||
(skipped/error) -> SKIP. A dtype without a threshold falls back to efficiency
|
t_cons.add_column("Max", justify="right")
|
||||||
(>=80 PASS / >=50 WARN / else FAIL).
|
t_cons.add_column("Spread", justify="right")
|
||||||
|
t_cons.add_column("Status", justify="right")
|
||||||
Returns {"rows": [(dtype, achieved, threshold, status), ...], "verdict": str}.
|
for dt, row in consistency.items():
|
||||||
"""
|
status = "PASS" if row.get("passed") else "FAIL"
|
||||||
comp = results.get("compute", results)
|
color = "green" if row.get("passed") else "red"
|
||||||
per_dtype = comp.get("per_dtype_tflops", {})
|
t_cons.add_row(
|
||||||
thresholds = comp.get("pass_thresholds_tflops", {}) or {}
|
dt.upper(),
|
||||||
eff = comp.get("efficiency_pct", {})
|
f"{row.get('min_tflops', 0):.1f}",
|
||||||
rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
|
f"{row.get('mean_tflops', 0):.1f}",
|
||||||
rows, verdict = [], "PASS"
|
f"{row.get('max_tflops', 0):.1f}",
|
||||||
for dt, val in per_dtype.items():
|
f"{row.get('spread_pct', 0):.2f}%",
|
||||||
thr = thresholds.get(dt)
|
f"[{color}]{status}[/{color}]",
|
||||||
if isinstance(val, str):
|
)
|
||||||
status = "SKIP"
|
c.print(t_cons)
|
||||||
elif thr:
|
|
||||||
status = "PASS" if val >= thr else ("WARN" if val >= thr * 0.9 else "FAIL")
|
|
||||||
else:
|
|
||||||
e = eff.get(dt, 0)
|
|
||||||
status = "PASS" if e >= 80 else ("WARN" if e >= 50 else "FAIL")
|
|
||||||
rows.append((dt, val, thr, status))
|
|
||||||
if rank[status] > rank[verdict]:
|
|
||||||
verdict = status
|
|
||||||
return {"rows": rows, "verdict": verdict}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def print_compute_verdict(results: dict, console: Console = None) -> str:
|
|
||||||
"""Print the PASS/WARN/FAIL table for compute results; return the verdict."""
|
|
||||||
c = console or Console()
|
|
||||||
judged = Benchmark.judge_compute(results)
|
|
||||||
color = {"PASS": "green", "WARN": "yellow", "FAIL": "red", "SKIP": "dim"}
|
|
||||||
c.print("\n[bold cyan]Compute Verdict (vs thresholds)[/bold cyan]")
|
|
||||||
for dt, val, thr, status in judged["rows"]:
|
|
||||||
val_s = f"{val:.1f}" if isinstance(val, (int, float)) else str(val)
|
|
||||||
thr_s = f">= {thr}" if thr else "(efficiency)"
|
|
||||||
c.print(f" {dt.upper():>4}: {val_s:>8} {thr_s:<12} [{color[status]}]{status}[/{color[status]}]")
|
|
||||||
v = judged["verdict"]
|
|
||||||
c.print(f" [bold]VERDICT: [{color[v]}]{v}[/{color[v]}][/bold]")
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
def _run_cli() -> None:
|
|
||||||
"""`python -m modules.benchmark` — run ONLY the compute-throughput benchmark."""
|
|
||||||
import argparse
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
repo_root = Path(__file__).resolve().parent.parent
|
|
||||||
parser = argparse.ArgumentParser(description="Run the compute-throughput benchmark only.")
|
|
||||||
parser.add_argument("--config", default=str(repo_root / "configs" / "default.yaml"),
|
|
||||||
help="path to config YAML (default: configs/default.yaml)")
|
|
||||||
parser.add_argument("--json", action="store_true", help="also print raw JSON of the compute results")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
with open(args.config) as f:
|
|
||||||
config = yaml.safe_load(f) or {}
|
|
||||||
|
|
||||||
results = Benchmark(config).run_compute_benchmark()
|
|
||||||
Benchmark.print_results(results)
|
|
||||||
Benchmark.print_compute_verdict(results)
|
|
||||||
|
|
||||||
if args.json:
|
|
||||||
print("JSON_RESULT:" + json.dumps(results["compute"]))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
_run_cli()
|
|
||||||
|
|||||||
@ -11,7 +11,6 @@ GPU_NAME_PATTERNS = {
|
|||||||
"A100": "a100",
|
"A100": "a100",
|
||||||
"A800": "a800",
|
"A800": "a800",
|
||||||
"H100": "h100",
|
"H100": "h100",
|
||||||
"H800": "h800", # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
|
|
||||||
"H200": "h200",
|
"H200": "h200",
|
||||||
"H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
|
"H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
|
||||||
"B200": "b200",
|
"B200": "b200",
|
||||||
@ -36,17 +35,11 @@ GPU_SPECS = {
|
|||||||
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
"fp16_tflops": 990, # dense (1979 sparse w/ 2:4)
|
||||||
"bf16_tflops": 990, # dense
|
"bf16_tflops": 990, # dense
|
||||||
"fp8_tflops": 1979, # dense
|
"fp8_tflops": 1979, # dense
|
||||||
|
"fp64_tflops": 67,
|
||||||
|
"int8_tflops": 1979,
|
||||||
"compute_pass_thresholds_tflops": {
|
"compute_pass_thresholds_tflops": {
|
||||||
# Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
|
"fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400,
|
||||||
# threshold ~2-4% below the sustained value measured across 16 GPUs via the
|
"fp64": 63, "int8": 1536,
|
||||||
# MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
|
|
||||||
# fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
|
|
||||||
# fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
|
|
||||||
# on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
|
|
||||||
# an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
|
|
||||||
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
|
|
||||||
# FP64 63 / INT8 1536 — listed for documentation; benchmark module
|
|
||||||
# doesn't currently exercise these dtypes.
|
|
||||||
},
|
},
|
||||||
"tdp_watts": 700,
|
"tdp_watts": 700,
|
||||||
"nvlink_gen": 4,
|
"nvlink_gen": 4,
|
||||||
@ -67,51 +60,10 @@ GPU_SPECS = {
|
|||||||
"fp16_tflops": 990, # dense
|
"fp16_tflops": 990, # dense
|
||||||
"bf16_tflops": 990, # dense
|
"bf16_tflops": 990, # dense
|
||||||
"fp8_tflops": 1979, # dense
|
"fp8_tflops": 1979, # dense
|
||||||
# PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
|
|
||||||
# calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
|
|
||||||
# achievable TFLOPS in PyTorch is in the same band.
|
|
||||||
"compute_pass_thresholds_tflops": {
|
|
||||||
"fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
|
|
||||||
},
|
|
||||||
"tdp_watts": 700,
|
"tdp_watts": 700,
|
||||||
"nvlink_gen": 4,
|
"nvlink_gen": 4,
|
||||||
"nvlink_bandwidth_gbps": 900,
|
"nvlink_bandwidth_gbps": 900,
|
||||||
"pcie_gen": 5,
|
"pcie_gen": 5,
|
||||||
"min_driver_version": "545",
|
|
||||||
"min_cuda_version": "12.4",
|
|
||||||
},
|
|
||||||
"h800": {
|
|
||||||
# H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
|
|
||||||
# clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
|
|
||||||
# FP32) are identical to H100. Two restrictions vs H100:
|
|
||||||
# 1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
|
|
||||||
# 2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
|
|
||||||
# All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
|
|
||||||
# NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
|
|
||||||
"full_name": "NVIDIA H800 SXM5",
|
|
||||||
"architecture": "Hopper",
|
|
||||||
"compute_capability": 9.0,
|
|
||||||
"hbm_capacity_gb": 80,
|
|
||||||
"hbm_type": "HBM3",
|
|
||||||
"memory_bandwidth_gbps": 3350, # GB/s (3.35 TB/s) — same as H100 SXM
|
|
||||||
"fp32_tflops": 67,
|
|
||||||
"tf32_tflops": 495, # dense (same as H100)
|
|
||||||
"fp16_tflops": 990, # dense (same as H100)
|
|
||||||
"bf16_tflops": 990, # dense (same as H100)
|
|
||||||
"fp8_tflops": 1979, # dense (same as H100)
|
|
||||||
# Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
|
|
||||||
# eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
|
|
||||||
# tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
|
|
||||||
# 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
|
|
||||||
# can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
|
|
||||||
# h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
|
|
||||||
"compute_pass_thresholds_tflops": {
|
|
||||||
"fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
|
|
||||||
},
|
|
||||||
"tdp_watts": 700,
|
|
||||||
"nvlink_gen": 4,
|
|
||||||
"nvlink_bandwidth_gbps": 400, # bidirectional — HALF of H100 (export restriction)
|
|
||||||
"pcie_gen": 5,
|
|
||||||
"min_driver_version": "535",
|
"min_driver_version": "535",
|
||||||
"min_cuda_version": "12.1",
|
"min_cuda_version": "12.1",
|
||||||
},
|
},
|
||||||
|
|||||||
@ -1,15 +1,17 @@
|
|||||||
"""RDMA / InfiniBand bandwidth and latency test module."""
|
"""RDMA / InfiniBand bandwidth and latency test module."""
|
||||||
|
|
||||||
|
import glob
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
|
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
|
|
||||||
|
from modules.gpu_specs import resolve_tools_dir
|
||||||
|
|
||||||
|
|
||||||
class RDMATest:
|
class RDMATest:
|
||||||
|
|
||||||
@ -17,11 +19,24 @@ class RDMATest:
|
|||||||
self.config = config
|
self.config = config
|
||||||
self.console = Console()
|
self.console = Console()
|
||||||
self.rdma_cfg = config.get("rdma", {})
|
self.rdma_cfg = config.get("rdma", {})
|
||||||
|
self.tools_dir = resolve_tools_dir(config)
|
||||||
|
|
||||||
def _find_tool(self, name: str) -> Optional[str]:
|
def _find_tool(self, name: str) -> Optional[str]:
|
||||||
p = shutil.which(name)
|
p = shutil.which(name)
|
||||||
if p:
|
if p:
|
||||||
return p
|
return p
|
||||||
|
candidates = [
|
||||||
|
os.path.join(self.tools_dir, "perftest", name),
|
||||||
|
os.path.join(self.tools_dir, "perftest", "bin", name),
|
||||||
|
os.path.join(self.tools_dir, "rdma", name),
|
||||||
|
os.path.join(self.tools_dir, name),
|
||||||
|
]
|
||||||
|
for path in candidates:
|
||||||
|
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||||
|
return path
|
||||||
|
for path in glob.glob(os.path.join(self.tools_dir, "**", name), recursive=True):
|
||||||
|
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||||
|
return path
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _get_ib_devices(self) -> List[str]:
|
def _get_ib_devices(self) -> List[str]:
|
||||||
@ -101,26 +116,40 @@ class RDMATest:
|
|||||||
|
|
||||||
self.console.print(f"[cyan]RDMA Test - Devices: {', '.join(devices)}[/cyan]")
|
self.console.print(f"[cyan]RDMA Test - Devices: {', '.join(devices)}[/cyan]")
|
||||||
|
|
||||||
bw_results = self._run_bandwidth_tests(devices)
|
active_pairs = [
|
||||||
latency_results = self._run_latency_tests(devices)
|
(dev, port) for dev, port in ib_devices
|
||||||
|
if "ACTIVE" in self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state").upper()
|
||||||
|
]
|
||||||
|
port_checks = self._evaluate_port_checks(device_info)
|
||||||
|
test_devices = [dev for dev, _ in active_pairs]
|
||||||
|
bw_results = self._run_bandwidth_tests(test_devices)
|
||||||
|
latency_results = self._run_latency_tests(test_devices)
|
||||||
|
ibping_results = self._run_ibping_tests(active_pairs)
|
||||||
|
fabric_counters = self._collect_pfc_ecn_counters() if self.rdma_cfg.get("pfc_ecn_counters", True) else {}
|
||||||
|
failures = self._failure_reasons(port_checks, bw_results, latency_results, ibping_results, fabric_counters)
|
||||||
|
|
||||||
|
fabric_counters_missing = (
|
||||||
|
self.rdma_cfg.get("pfc_ecn_counters", True)
|
||||||
|
and fabric_counters
|
||||||
|
and not fabric_counters.get("counters")
|
||||||
|
)
|
||||||
all_passed = all(
|
all_passed = all(
|
||||||
r.get("status") == "PASS"
|
r.get("status") == "PASS"
|
||||||
for r in bw_results + latency_results
|
for r in bw_results + latency_results + ibping_results
|
||||||
if isinstance(r, dict)
|
if isinstance(r, dict)
|
||||||
)
|
) and all(p.get("status") == "PASS" for p in port_checks) and not fabric_counters.get("failed", False) and not fabric_counters_missing
|
||||||
|
|
||||||
result = {
|
return {
|
||||||
"passed": all_passed,
|
"passed": all_passed,
|
||||||
"devices": device_info,
|
"devices": device_info,
|
||||||
|
"port_checks": port_checks,
|
||||||
"bandwidth_tests": bw_results,
|
"bandwidth_tests": bw_results,
|
||||||
"latency_tests": latency_results,
|
"latency_tests": latency_results,
|
||||||
|
"ibping_tests": ibping_results,
|
||||||
|
"fabric_counters": fabric_counters,
|
||||||
|
"failures": failures,
|
||||||
"timestamp": datetime.now().isoformat(),
|
"timestamp": datetime.now().isoformat(),
|
||||||
}
|
}
|
||||||
# Cross-node (two-host) RDMA, run only when a peer is configured.
|
|
||||||
if (self.rdma_cfg.get("cross_node", {}) or {}).get("enabled"):
|
|
||||||
result["cross_node"] = self.run_cross_node()
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _collect_device_info(self, devices: List[str]) -> List[dict]:
|
def _collect_device_info(self, devices: List[str]) -> List[dict]:
|
||||||
info = []
|
info = []
|
||||||
@ -141,11 +170,83 @@ class RDMATest:
|
|||||||
port_info[label] = f.read().strip()
|
port_info[label] = f.read().strip()
|
||||||
except (FileNotFoundError, PermissionError):
|
except (FileNotFoundError, PermissionError):
|
||||||
port_info[label] = "N/A"
|
port_info[label] = "N/A"
|
||||||
|
port_info["link_layer"] = self._read_sys(
|
||||||
|
f"/sys/class/infiniband/{dev}/ports/{port}/link_layer"
|
||||||
|
) or "N/A"
|
||||||
|
|
||||||
dev_info["ports"].append(port_info)
|
dev_info["ports"].append(port_info)
|
||||||
info.append(dev_info)
|
info.append(dev_info)
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def _evaluate_port_checks(self, device_info: List[dict]) -> List[dict]:
|
||||||
|
checks = []
|
||||||
|
min_rate = float(self.rdma_cfg.get("min_port_rate_gbps", 400))
|
||||||
|
for dev in device_info:
|
||||||
|
for port in dev.get("ports", []):
|
||||||
|
if port.get("link_layer") != "InfiniBand":
|
||||||
|
continue
|
||||||
|
state = port.get("state", "")
|
||||||
|
rate = port.get("rate", "")
|
||||||
|
rate_gbps = self._parse_rate_gbps(rate)
|
||||||
|
status = "PASS" if "ACTIVE" in state.upper() and rate_gbps >= min_rate else "FAIL"
|
||||||
|
checks.append({
|
||||||
|
"device": dev.get("name"),
|
||||||
|
"port": port.get("port"),
|
||||||
|
"state": state,
|
||||||
|
"rate": rate,
|
||||||
|
"rate_gbps": rate_gbps,
|
||||||
|
"min_rate_gbps": min_rate,
|
||||||
|
"status": status,
|
||||||
|
})
|
||||||
|
return checks
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_rate_gbps(rate: str) -> float:
|
||||||
|
# Example: "400 Gb/sec (4X NDR)"
|
||||||
|
try:
|
||||||
|
return float(str(rate).split()[0])
|
||||||
|
except (ValueError, IndexError, AttributeError):
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _failure_reasons(port_checks: List[dict], bw_results: List[dict],
|
||||||
|
latency_results: List[dict], ibping_results: List[dict],
|
||||||
|
fabric_counters: dict) -> List[str]:
|
||||||
|
failures = []
|
||||||
|
for p in port_checks:
|
||||||
|
if p.get("status") != "PASS":
|
||||||
|
failures.append(
|
||||||
|
f"{p.get('device')} port {p.get('port')} state/rate failed "
|
||||||
|
f"({p.get('state')}, {p.get('rate')}; required >= {p.get('min_rate_gbps')}Gbps ACTIVE)"
|
||||||
|
)
|
||||||
|
for r in bw_results:
|
||||||
|
if r.get("status") != "PASS":
|
||||||
|
if r.get("error"):
|
||||||
|
failures.append(f"{r.get('test')} failed: {r.get('error')}")
|
||||||
|
else:
|
||||||
|
failures.append(
|
||||||
|
f"{r.get('test')} bandwidth {r.get('bandwidth_gbps', 0)}GB/s "
|
||||||
|
f"< {r.get('min_required_gbps', 'N/A')}GB/s"
|
||||||
|
)
|
||||||
|
for r in latency_results:
|
||||||
|
if r.get("status") != "PASS":
|
||||||
|
if r.get("error"):
|
||||||
|
failures.append(f"{r.get('test')} failed: {r.get('error')}")
|
||||||
|
else:
|
||||||
|
failures.append(
|
||||||
|
f"{r.get('test')} latency {r.get('latency_us', 0)}us "
|
||||||
|
f"> {r.get('max_allowed_us', 'N/A')}us"
|
||||||
|
)
|
||||||
|
for r in ibping_results:
|
||||||
|
if r.get("status") != "PASS":
|
||||||
|
failures.append(f"{r.get('test')} failed: {r.get('error') or r.get('output_tail', '')[:120]}")
|
||||||
|
if fabric_counters.get("failed"):
|
||||||
|
nonzero = [f"{k}={v}" for k, v in fabric_counters.get("counters", {}).items() if v]
|
||||||
|
failures.append("non-zero PFC/ECN/CNP/congestion counters: " + ", ".join(nonzero[:10]))
|
||||||
|
elif fabric_counters and not fabric_counters.get("counters"):
|
||||||
|
failures.append("PFC/ECN/CNP/congestion counters not found; fabric counter evidence missing")
|
||||||
|
return failures
|
||||||
|
|
||||||
def _run_ib_command(self, cmd: List[str], timeout: int = 60) -> dict:
|
def _run_ib_command(self, cmd: List[str], timeout: int = 60) -> dict:
|
||||||
try:
|
try:
|
||||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
||||||
@ -168,44 +269,69 @@ class RDMATest:
|
|||||||
iters = self.rdma_cfg.get("ib_iterations", 1000)
|
iters = self.rdma_cfg.get("ib_iterations", 1000)
|
||||||
dx = self.rdma_cfg.get("ib_device", None)
|
dx = self.rdma_cfg.get("ib_device", None)
|
||||||
port = self.rdma_cfg.get("ib_port", 1)
|
port = self.rdma_cfg.get("ib_port", 1)
|
||||||
|
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||||
|
role = self.rdma_cfg.get("role", "auto")
|
||||||
|
|
||||||
for tool, label in [(ib_write_bw, "ib_write_bw"), (ib_read_bw, "ib_read_bw")]:
|
for tool, label in [(ib_write_bw, "ib_write_bw"), (ib_read_bw, "ib_read_bw")]:
|
||||||
if not tool:
|
if not tool:
|
||||||
results.append({"test": label, "status": "SKIP", "error": "not installed"})
|
results.append({"test": label, "status": "FAIL", "error": "not installed"})
|
||||||
|
continue
|
||||||
|
if role == "client" and not server_addr:
|
||||||
|
results.append({
|
||||||
|
"test": label,
|
||||||
|
"status": "FAIL",
|
||||||
|
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
|
||||||
|
"role": "client",
|
||||||
|
})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size)]
|
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
|
||||||
client_cmd = server_cmd + ["localhost"]
|
client_cmd = server_cmd + [server_addr or "localhost"]
|
||||||
|
|
||||||
|
if role == "server":
|
||||||
|
results.append(self._run_server_mode(label, server_cmd))
|
||||||
|
continue
|
||||||
|
|
||||||
|
server = None
|
||||||
|
if not server_addr and role != "client":
|
||||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||||
import time
|
import time
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
||||||
|
if server:
|
||||||
server.wait(timeout=10)
|
server.wait(timeout=10)
|
||||||
|
|
||||||
output = client.stdout + server.stdout.read() if server.stdout else ""
|
output = client.stdout
|
||||||
bw_mbps = 0
|
if server and server.stdout:
|
||||||
|
output += server.stdout.read()
|
||||||
|
bw_mibps = 0
|
||||||
for line in output.split("\n"):
|
for line in output.split("\n"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
parts = line.split()
|
parts = line.split()
|
||||||
try:
|
try:
|
||||||
bw_mbps = max(bw_mbps, float(parts[-1]))
|
if len(parts) >= 5 and int(parts[0]) == int(msg_size):
|
||||||
|
# perftest bandwidth rows:
|
||||||
|
# #bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
|
||||||
|
bw_mibps = max(bw_mibps, float(parts[3]))
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
bw_gbps = bw_mbps / 1000 if bw_mbps else 0
|
bw_gbps = bw_mibps * 1024 * 1024 / 1e9 if bw_mibps else 0
|
||||||
status = "PASS" if bw_gbps >= min_bw else "WARN"
|
status = "PASS" if bw_gbps >= min_bw else "FAIL"
|
||||||
results.append({
|
results.append({
|
||||||
"test": label,
|
"test": label,
|
||||||
"status": status,
|
"status": status,
|
||||||
"bandwidth_gbps": round(bw_gbps, 2),
|
"bandwidth_gbps": round(bw_gbps, 2),
|
||||||
"min_required_gbps": min_bw,
|
"min_required_gbps": min_bw,
|
||||||
|
"msg_size": msg_size,
|
||||||
|
"role": "client" if server_addr else "local_loopback",
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if server:
|
||||||
server.kill()
|
server.kill()
|
||||||
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
||||||
|
|
||||||
@ -216,240 +342,214 @@ class RDMATest:
|
|||||||
ib_write_lat = self._find_tool("ib_write_lat")
|
ib_write_lat = self._find_tool("ib_write_lat")
|
||||||
ib_read_lat = self._find_tool("ib_read_lat")
|
ib_read_lat = self._find_tool("ib_read_lat")
|
||||||
max_lat_us = self.rdma_cfg.get("max_latency_us", 10)
|
max_lat_us = self.rdma_cfg.get("max_latency_us", 10)
|
||||||
|
max_by_test = {
|
||||||
|
"ib_write_lat": self.rdma_cfg.get("max_write_latency_us", max_lat_us),
|
||||||
|
"ib_read_lat": self.rdma_cfg.get("max_read_latency_us", max_lat_us),
|
||||||
|
}
|
||||||
dx = self.rdma_cfg.get("ib_device", None)
|
dx = self.rdma_cfg.get("ib_device", None)
|
||||||
port = self.rdma_cfg.get("ib_port", 1)
|
port = self.rdma_cfg.get("ib_port", 1)
|
||||||
|
msg_size = self.rdma_cfg.get("latency_msg_size", 8)
|
||||||
|
iters = self.rdma_cfg.get("ib_iterations", 1000)
|
||||||
|
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||||
|
role = self.rdma_cfg.get("role", "auto")
|
||||||
|
|
||||||
for tool, label in [(ib_write_lat, "ib_write_lat"), (ib_read_lat, "ib_read_lat")]:
|
for tool, label in [(ib_write_lat, "ib_write_lat"), (ib_read_lat, "ib_read_lat")]:
|
||||||
if not tool:
|
if not tool:
|
||||||
results.append({"test": label, "status": "SKIP", "error": "not installed"})
|
results.append({"test": label, "status": "FAIL", "error": "not installed"})
|
||||||
|
continue
|
||||||
|
if role == "client" and not server_addr:
|
||||||
|
results.append({
|
||||||
|
"test": label,
|
||||||
|
"status": "FAIL",
|
||||||
|
"error": "rdma.role=client requires rdma.server_addr or RDMA_SERVER_ADDR",
|
||||||
|
"role": "client",
|
||||||
|
})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port)]
|
server_cmd = [tool, "-d", dx or devices[0], "-i", str(port), "-s", str(msg_size), "-n", str(iters)]
|
||||||
client_cmd = server_cmd + ["localhost"]
|
client_cmd = server_cmd + [server_addr or "localhost"]
|
||||||
|
|
||||||
|
if role == "server":
|
||||||
|
results.append(self._run_server_mode(label, server_cmd))
|
||||||
|
continue
|
||||||
|
|
||||||
|
server = None
|
||||||
|
if not server_addr and role != "client":
|
||||||
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
server = subprocess.Popen(server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||||
import time
|
import time
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
client = subprocess.run(client_cmd, capture_output=True, text=True, timeout=60)
|
||||||
|
if server:
|
||||||
server.wait(timeout=10)
|
server.wait(timeout=10)
|
||||||
|
|
||||||
output = client.stdout + server.stdout.read() if server.stdout else ""
|
output = client.stdout
|
||||||
|
if server and server.stdout:
|
||||||
|
output += server.stdout.read()
|
||||||
lat_us = 0
|
lat_us = 0
|
||||||
for line in output.split("\n"):
|
for line in output.split("\n"):
|
||||||
parts = line.strip().split()
|
parts = line.strip().split()
|
||||||
try:
|
try:
|
||||||
lat_us = max(lat_us, float(parts[-1]))
|
if len(parts) >= 6:
|
||||||
|
int(parts[0])
|
||||||
|
int(parts[1])
|
||||||
|
# perftest latency rows:
|
||||||
|
# #bytes #iterations t_min t_max t_typical t_avg t_stdev p99 p99.9
|
||||||
|
lat_us = max(lat_us, float(parts[5]))
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
status = "PASS" if 0 < lat_us <= max_lat_us else ("WARN" if lat_us > 0 else "FAIL")
|
max_allowed = max_by_test[label]
|
||||||
|
status = "PASS" if 0 < lat_us <= max_allowed else "FAIL"
|
||||||
results.append({
|
results.append({
|
||||||
"test": label,
|
"test": label,
|
||||||
"status": status,
|
"status": status,
|
||||||
"latency_us": round(lat_us, 2),
|
"latency_us": round(lat_us, 2),
|
||||||
"max_allowed_us": max_lat_us,
|
"max_allowed_us": max_allowed,
|
||||||
|
"msg_size": msg_size,
|
||||||
|
"role": "client" if server_addr else "local_loopback",
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if server:
|
||||||
server.kill()
|
server.kill()
|
||||||
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
results.append({"test": label, "status": "FAIL", "error": str(e)})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
def _run_server_mode(self, label: str, server_cmd: List[str]) -> dict:
|
||||||
# Cross-node (two-host) RDMA over perftest, orchestrated via SSH.
|
timeout = int(self.rdma_cfg.get("server_timeout_sec", 120))
|
||||||
# Runs FROM the client host: for each IB device it launches the matching
|
try:
|
||||||
# perftest server on the peer over SSH (held open in a live ssh channel),
|
r = subprocess.run(server_cmd, capture_output=True, text=True, timeout=timeout)
|
||||||
# then runs the local client against the peer's OOB address and parses the
|
|
||||||
# result. Replaces the old standalone scripts/rdma_cross_node.sh.
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _active_ib_devices(self) -> List[str]:
|
|
||||||
"""IB devices whose port 1 is InfiniBand link_layer and ACTIVE."""
|
|
||||||
out = []
|
|
||||||
for dev in self._get_ib_devices():
|
|
||||||
for port in self._get_ib_ports(dev):
|
|
||||||
ll = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/link_layer")
|
|
||||||
st = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/state")
|
|
||||||
if ll == "InfiniBand" and "ACTIVE" in st.upper():
|
|
||||||
out.append(dev)
|
|
||||||
break
|
|
||||||
return out
|
|
||||||
|
|
||||||
def run_cross_node(self) -> dict:
|
|
||||||
cn = self.rdma_cfg.get("cross_node", {}) or {}
|
|
||||||
if not cn.get("enabled"):
|
|
||||||
return {"status": "SKIP", "skipped": True,
|
|
||||||
"reason": "rdma.cross_node.enabled is false"}
|
|
||||||
|
|
||||||
server = cn.get("server")
|
|
||||||
if not server:
|
|
||||||
return {"status": "SKIP", "skipped": True,
|
|
||||||
"reason": "rdma.cross_node.server (peer ssh address) not set"}
|
|
||||||
|
|
||||||
ssh_user = cn.get("ssh_user", "root")
|
|
||||||
server_target = server if "@" in server else f"{ssh_user}@{server}"
|
|
||||||
# OOB address the client's perftest connects to (defaults to the ssh host).
|
|
||||||
server_addr = cn.get("server_addr") or server.split("@")[-1]
|
|
||||||
ib_port = cn.get("ib_port", 1)
|
|
||||||
gid_index = cn.get("gid_index")
|
|
||||||
msg_size = cn.get("msg_size", 1048576)
|
|
||||||
iters = cn.get("iters", 5000)
|
|
||||||
base_port = cn.get("base_oob_port", 18515)
|
|
||||||
warmup = cn.get("server_warmup_sec", 2.0)
|
|
||||||
min_bw = cn.get("min_bandwidth_gbps", 350)
|
|
||||||
max_lat = cn.get("max_latency_us", 5)
|
|
||||||
|
|
||||||
devices = cn.get("devices") or self._active_ib_devices()
|
|
||||||
if not devices:
|
|
||||||
return {"status": "SKIP", "skipped": True,
|
|
||||||
"reason": "no active InfiniBand devices to test"}
|
|
||||||
|
|
||||||
has_bw = self._find_tool("ib_write_bw") is not None
|
|
||||||
has_lat = self._find_tool("ib_write_lat") is not None
|
|
||||||
if not has_bw and not has_lat:
|
|
||||||
return {"status": "SKIP", "skipped": True,
|
|
||||||
"reason": "perftest (ib_write_bw / ib_write_lat) not installed"}
|
|
||||||
|
|
||||||
self.console.print(
|
|
||||||
f"[cyan]Cross-node RDMA — client → {server_addr}, "
|
|
||||||
f"devices: {', '.join(devices)}[/cyan]")
|
|
||||||
|
|
||||||
per_device = []
|
|
||||||
for idx, dev in enumerate(devices):
|
|
||||||
oob = base_port + idx
|
|
||||||
entry = {"device": dev}
|
|
||||||
|
|
||||||
if has_bw:
|
|
||||||
bw = self._cross_node_perftest(
|
|
||||||
"ib_write_bw", dev, server_target, server_addr, ib_port,
|
|
||||||
oob, gid_index, warmup,
|
|
||||||
extra=["--report_gbits", "-s", str(msg_size), "-n", str(iters)],
|
|
||||||
parse="bw")
|
|
||||||
entry["bandwidth_gbps"] = bw
|
|
||||||
if isinstance(bw, (int, float)):
|
|
||||||
entry["bw_status"] = "PASS" if bw >= min_bw else "WARN"
|
|
||||||
else:
|
|
||||||
entry["bw_status"] = "FAIL"
|
|
||||||
|
|
||||||
if has_lat:
|
|
||||||
lat = self._cross_node_perftest(
|
|
||||||
"ib_write_lat", dev, server_target, server_addr, ib_port,
|
|
||||||
oob, gid_index, warmup, extra=[], parse="lat")
|
|
||||||
if isinstance(lat, dict):
|
|
||||||
entry["latency_us"] = lat.get("typical")
|
|
||||||
entry["latency_p99_us"] = lat.get("p99")
|
|
||||||
t = lat.get("typical")
|
|
||||||
entry["lat_status"] = ("PASS" if isinstance(t, (int, float)) and 0 < t <= max_lat
|
|
||||||
else ("WARN" if isinstance(t, (int, float)) else "FAIL"))
|
|
||||||
else:
|
|
||||||
entry["latency_us"] = lat
|
|
||||||
entry["lat_status"] = "FAIL"
|
|
||||||
|
|
||||||
per_device.append(entry)
|
|
||||||
|
|
||||||
statuses = [e.get(k) for e in per_device for k in ("bw_status", "lat_status") if e.get(k)]
|
|
||||||
verdict = "PASS"
|
|
||||||
for s in statuses:
|
|
||||||
if s == "FAIL":
|
|
||||||
verdict = "FAIL"
|
|
||||||
break
|
|
||||||
if s == "WARN" and verdict == "PASS":
|
|
||||||
verdict = "WARN"
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": verdict,
|
"test": label,
|
||||||
"server": server_addr,
|
"status": "PASS" if r.returncode == 0 else "FAIL",
|
||||||
"min_bandwidth_gbps": min_bw,
|
"role": "server",
|
||||||
"max_latency_us": max_lat,
|
"server_timeout_sec": timeout,
|
||||||
"per_device": per_device,
|
"output_tail": (r.stdout + r.stderr)[-500:],
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def _cross_node_perftest(self, tool: str, dev: str, server_target: str,
|
|
||||||
server_addr: str, ib_port: int, oob_port: int,
|
|
||||||
gid_index, warmup: float, extra: List[str], parse: str):
|
|
||||||
"""Start `tool` server on the peer via SSH, run the local client, parse output.
|
|
||||||
|
|
||||||
Returns a float (bw, Gb/s), a dict {typical, p99} (lat, µs), or an error string.
|
|
||||||
"""
|
|
||||||
tool_path = self._find_tool(tool)
|
|
||||||
if not tool_path:
|
|
||||||
return f"{tool} not installed"
|
|
||||||
|
|
||||||
flags = ["-d", dev, "-i", str(ib_port), "-p", str(oob_port), "-F"]
|
|
||||||
if gid_index is not None:
|
|
||||||
flags += ["-x", str(gid_index)]
|
|
||||||
flags += extra
|
|
||||||
|
|
||||||
server_cmd = " ".join([tool] + flags) # server: no host argument
|
|
||||||
server_proc = None
|
|
||||||
try:
|
|
||||||
server_proc = subprocess.Popen(
|
|
||||||
["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
|
|
||||||
server_target, server_cmd],
|
|
||||||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
|
||||||
time.sleep(warmup) # let the remote server bind before the client connects
|
|
||||||
|
|
||||||
client = subprocess.run([tool_path] + flags + [server_addr],
|
|
||||||
capture_output=True, text=True, timeout=120)
|
|
||||||
out = client.stdout + "\n" + (client.stderr or "")
|
|
||||||
return self._parse_perftest_lat(out) if parse == "lat" else self._parse_perftest_bw(out)
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return "timeout"
|
return {
|
||||||
except Exception as e: # noqa: BLE001
|
"test": label,
|
||||||
return f"error: {e}"
|
"status": "PASS",
|
||||||
finally:
|
"role": "server",
|
||||||
if server_proc and server_proc.poll() is None:
|
"server_timeout_sec": timeout,
|
||||||
server_proc.terminate()
|
"note": "server ran until timeout waiting for client",
|
||||||
try:
|
}
|
||||||
server_proc.wait(timeout=5)
|
except Exception as e:
|
||||||
except Exception:
|
return {"test": label, "status": "FAIL", "role": "server", "error": str(e)}
|
||||||
server_proc.kill()
|
|
||||||
# ib_write_* server normally exits after one run; pkill cleans up a
|
def _run_ibping_tests(self, active_pairs: List[tuple[str, str]]) -> List[dict]:
|
||||||
# leftover one if the client failed mid-handshake. -x matches the exact
|
tool = self._find_tool("ibping")
|
||||||
# process name so it never kills this ssh command itself.
|
if not tool:
|
||||||
try:
|
return [{"test": "ibping", "status": "FAIL", "error": "not installed"}]
|
||||||
subprocess.run(
|
if not active_pairs:
|
||||||
["ssh", "-o", "BatchMode=yes", server_target, f"pkill -x {tool}"],
|
return [{"test": "ibping", "status": "FAIL", "error": "no active IB ports"}]
|
||||||
capture_output=True, timeout=10)
|
|
||||||
except Exception:
|
dev, port = active_pairs[0]
|
||||||
pass
|
target = (
|
||||||
|
self.rdma_cfg.get("ibping_target")
|
||||||
|
or os.environ.get("IBPING_TARGET")
|
||||||
|
)
|
||||||
|
count = int(self.rdma_cfg.get("ibping_count", 5))
|
||||||
|
role = self.rdma_cfg.get("role", "auto")
|
||||||
|
server_addr = self.rdma_cfg.get("server_addr") or os.environ.get("RDMA_SERVER_ADDR")
|
||||||
|
base = [tool, "-C", dev, "-P", str(port)]
|
||||||
|
|
||||||
|
if role == "server":
|
||||||
|
return [self._run_server_mode("ibping", [*base, "-S"])]
|
||||||
|
|
||||||
|
server = None
|
||||||
|
if not target and role != "client":
|
||||||
|
target = self._read_sys(f"/sys/class/infiniband/{dev}/ports/{port}/lid")
|
||||||
|
server = subprocess.Popen([*base, "-S"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||||
|
import time
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if not target:
|
||||||
|
reason = "no ibping target/lid"
|
||||||
|
if role == "client" or server_addr:
|
||||||
|
reason = (
|
||||||
|
"cross-node ibping requires rdma.ibping_target or IBPING_TARGET "
|
||||||
|
"(peer LID/GID; rdma.server_addr is only for perftest TCP bootstrap)"
|
||||||
|
)
|
||||||
|
return [{"test": "ibping", "status": "FAIL", "error": reason}]
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _parse_perftest_bw(output: str) -> float:
|
|
||||||
"""Parse ib_write_bw rows (#bytes #iter BW_peak BW_avg ...); return max BW avg."""
|
|
||||||
best = 0.0
|
|
||||||
for line in output.splitlines():
|
|
||||||
parts = line.split()
|
|
||||||
if len(parts) >= 4:
|
|
||||||
try:
|
try:
|
||||||
int(parts[0]) # #bytes column
|
r = subprocess.run([*base, "-c", str(count), str(target)], capture_output=True, text=True, timeout=30)
|
||||||
best = max(best, float(parts[3])) # BW average[Gb/sec]
|
if server:
|
||||||
|
server.terminate()
|
||||||
|
try:
|
||||||
|
server.wait(timeout=5)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
server.kill()
|
||||||
|
output = r.stdout + r.stderr
|
||||||
|
failed = r.returncode != 0 or "failed" in output.lower()
|
||||||
|
return [{
|
||||||
|
"test": "ibping",
|
||||||
|
"status": "FAIL" if failed else "PASS",
|
||||||
|
"role": "client" if server_addr or role == "client" else "local_loopback",
|
||||||
|
"direction": "outbound_to_peer" if server_addr or role == "client" else "local_loopback",
|
||||||
|
"target": str(target),
|
||||||
|
"count": count,
|
||||||
|
"output_tail": output[-500:],
|
||||||
|
}]
|
||||||
|
except Exception as e:
|
||||||
|
if server:
|
||||||
|
server.kill()
|
||||||
|
return [{"test": "ibping", "status": "FAIL", "error": str(e)}]
|
||||||
|
|
||||||
|
def _collect_pfc_ecn_counters(self) -> dict:
|
||||||
|
counters = {}
|
||||||
|
failed = False
|
||||||
|
keywords = ("pfc", "ecn", "cnp", "congestion")
|
||||||
|
for root, _, files in os.walk("/sys/class/infiniband"):
|
||||||
|
for name in files:
|
||||||
|
lower = name.lower()
|
||||||
|
if not any(k in lower for k in keywords):
|
||||||
|
continue
|
||||||
|
path = os.path.join(root, name)
|
||||||
|
val = self._read_sys(path)
|
||||||
|
try:
|
||||||
|
num = int(val)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
return round(best, 2) if best else 0.0
|
rel = path.replace("/sys/class/infiniband/", "")
|
||||||
|
counters[rel] = num
|
||||||
|
if num != 0:
|
||||||
|
failed = True
|
||||||
|
|
||||||
@staticmethod
|
ethtool = shutil.which("ethtool")
|
||||||
def _parse_perftest_lat(output: str) -> dict:
|
net_dir = "/sys/class/net"
|
||||||
"""Parse ib_write_lat row (#bytes #iter t_min t_max t_typical t_avg ... 99%)."""
|
if ethtool and os.path.isdir(net_dir):
|
||||||
for line in output.splitlines():
|
for iface in sorted(os.listdir(net_dir)):
|
||||||
parts = line.split()
|
|
||||||
if len(parts) >= 6:
|
|
||||||
try:
|
try:
|
||||||
int(parts[0]); int(parts[1])
|
r = subprocess.run(
|
||||||
typical = float(parts[4]) # t_typical[usec]
|
[ethtool, "-S", iface],
|
||||||
except ValueError:
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if r.returncode != 0:
|
||||||
|
continue
|
||||||
|
for line in r.stdout.splitlines():
|
||||||
|
if ":" not in line:
|
||||||
|
continue
|
||||||
|
key, value = line.split(":", 1)
|
||||||
|
key = key.strip()
|
||||||
|
lower = key.lower()
|
||||||
|
if not any(k in lower for k in keywords):
|
||||||
continue
|
continue
|
||||||
p99 = None
|
|
||||||
if len(parts) >= 8:
|
|
||||||
try:
|
try:
|
||||||
p99 = float(parts[7]) # 99% percentile[usec]
|
num = int(value.strip().split()[0])
|
||||||
except ValueError:
|
except (ValueError, IndexError):
|
||||||
p99 = None
|
continue
|
||||||
return {"typical": round(typical, 2), "p99": round(p99, 2) if p99 else None}
|
counters[f"net/{iface}/{key}"] = num
|
||||||
return {"typical": None, "p99": None}
|
if num != 0:
|
||||||
|
failed = True
|
||||||
|
return {"failed": failed, "counters": counters}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def print_results(results: dict, console: Console = None):
|
def print_results(results: dict, console: Console = None):
|
||||||
@ -496,28 +596,10 @@ class RDMATest:
|
|||||||
f"({lat:.2f} us, max: {t.get('max_allowed_us', 'N/A')} us)" if status != "SKIP"
|
f"({lat:.2f} us, max: {t.get('max_allowed_us', 'N/A')} us)" if status != "SKIP"
|
||||||
else f" {t['test']}: [dim]SKIPPED[/dim]")
|
else f" {t['test']}: [dim]SKIPPED[/dim]")
|
||||||
|
|
||||||
cn = results.get("cross_node")
|
ibping_tests = results.get("ibping_tests", [])
|
||||||
if cn:
|
if ibping_tests:
|
||||||
if cn.get("skipped"):
|
c.print("\n [bold]IB Ping Tests[/bold]")
|
||||||
c.print(f"\n [bold]Cross-node RDMA[/bold]: [dim]SKIPPED "
|
for t in ibping_tests:
|
||||||
f"({cn.get('reason', '')})[/dim]")
|
status = t.get("status", "FAIL")
|
||||||
else:
|
sc = "green" if status == "PASS" else "red"
|
||||||
v = cn.get("status", "?")
|
c.print(f" {t['test']}: [{sc}]{status}[/{sc}] target={t.get('target', 'N/A')}")
|
||||||
vc = "green" if v == "PASS" else ("yellow" if v == "WARN" else "red")
|
|
||||||
c.print(f"\n [bold]Cross-node RDMA[/bold] (server {cn.get('server')}) "
|
|
||||||
f"[{vc}]{v}[/{vc}] "
|
|
||||||
f"[dim]min {cn.get('min_bandwidth_gbps')} Gb/s, "
|
|
||||||
f"max {cn.get('max_latency_us')} µs[/dim]")
|
|
||||||
for e in cn.get("per_device", []):
|
|
||||||
bw = e.get("bandwidth_gbps")
|
|
||||||
lat = e.get("latency_us")
|
|
||||||
bws = e.get("bw_status", "")
|
|
||||||
lts = e.get("lat_status", "")
|
|
||||||
bc = "green" if bws == "PASS" else ("yellow" if bws == "WARN" else "red")
|
|
||||||
lc = "green" if lts == "PASS" else ("yellow" if lts == "WARN" else "red")
|
|
||||||
bw_s = f"{bw:.1f} Gb/s" if isinstance(bw, (int, float)) else str(bw)
|
|
||||||
lat_s = f"{lat:.2f} µs" if isinstance(lat, (int, float)) else str(lat)
|
|
||||||
p99 = e.get("latency_p99_us")
|
|
||||||
p99_s = f", p99 {p99:.2f}" if isinstance(p99, (int, float)) else ""
|
|
||||||
c.print(f" {e['device']}: BW [{bc}]{bw_s}[/{bc}] | "
|
|
||||||
f"lat [{lc}]{lat_s}[/{lc}]{p99_s}")
|
|
||||||
|
|||||||
@ -1,87 +0,0 @@
|
|||||||
# cuBLASLt FP8 GEMM Cross-Check Report
|
|
||||||
|
|
||||||
Date: 2026-05-24
|
|
||||||
|
|
||||||
Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
|
|
||||||
|
|
||||||
## Method
|
|
||||||
|
|
||||||
Added a direct cuBLASLt FP8 GEMM micro-benchmark:
|
|
||||||
|
|
||||||
- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
|
|
||||||
- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
|
|
||||||
- Input dtype: `CUDA_R_8F_E4M3`
|
|
||||||
- Output dtype: `CUDA_R_16BF`
|
|
||||||
- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
|
|
||||||
- Layout: cuBLASLt FP8-required TN format
|
|
||||||
- Matrix size: `8192`
|
|
||||||
- Warmup: `50`
|
|
||||||
- Iterations: `500`
|
|
||||||
- GPUs: single-node 8 GPUs, measured one GPU at a time
|
|
||||||
|
|
||||||
NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
|
|
||||||
|
|
||||||
## Results
|
|
||||||
|
|
||||||
### aikubeworker0012 / nccl-gpu-1
|
|
||||||
|
|
||||||
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
|
|
||||||
|
|
||||||
| GPU | FP8 TFLOPS |
|
|
||||||
|---:|---:|
|
|
||||||
| 0 | 1615.6 |
|
|
||||||
| 1 | 1611.0 |
|
|
||||||
| 2 | 1599.0 |
|
|
||||||
| 3 | 1607.1 |
|
|
||||||
| 4 | 1614.0 |
|
|
||||||
| 5 | 1604.4 |
|
|
||||||
| 6 | 1608.4 |
|
|
||||||
| 7 | 1609.1 |
|
|
||||||
|
|
||||||
Summary:
|
|
||||||
|
|
||||||
- Mean: `1608.6 TFLOPS`
|
|
||||||
- Min / Max: `1599.0 / 1615.6 TFLOPS`
|
|
||||||
- Spread: `1.03%`
|
|
||||||
- FP8 absolute threshold: `>= 1400 TFLOPS`
|
|
||||||
- Verdict against FP8 absolute threshold: **PASS**
|
|
||||||
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
|
|
||||||
|
|
||||||
### aikubeworker0016 / nccl-gpu-2
|
|
||||||
|
|
||||||
Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
|
|
||||||
|
|
||||||
| GPU | FP8 TFLOPS |
|
|
||||||
|---:|---:|
|
|
||||||
| 0 | 1602.3 |
|
|
||||||
| 1 | 1604.0 |
|
|
||||||
| 2 | 1616.9 |
|
|
||||||
| 3 | 1610.6 |
|
|
||||||
| 4 | 1620.5 |
|
|
||||||
| 5 | 1630.3 |
|
|
||||||
| 6 | 1605.1 |
|
|
||||||
| 7 | 1620.2 |
|
|
||||||
|
|
||||||
Summary:
|
|
||||||
|
|
||||||
- Mean: `1613.7 TFLOPS`
|
|
||||||
- Min / Max: `1602.3 / 1630.3 TFLOPS`
|
|
||||||
- Spread: `1.74%`
|
|
||||||
- FP8 absolute threshold: `>= 1400 TFLOPS`
|
|
||||||
- Verdict against FP8 absolute threshold: **PASS**
|
|
||||||
- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
|
|
||||||
|
|
||||||
## Comparison With Existing PyTorch `_scaled_mm` Result
|
|
||||||
|
|
||||||
| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
|
|
||||||
|---|---:|---:|---:|
|
|
||||||
| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
|
|
||||||
| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
|
|
||||||
|
|
||||||
The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
|
|
||||||
|
|
||||||
Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"per_gpu": [
|
|
||||||
{"index": 0, "fp8_tflops": 1615.6},
|
|
||||||
{"index": 1, "fp8_tflops": 1611.0},
|
|
||||||
{"index": 2, "fp8_tflops": 1599.0},
|
|
||||||
{"index": 3, "fp8_tflops": 1607.1},
|
|
||||||
{"index": 4, "fp8_tflops": 1614.0},
|
|
||||||
{"index": 5, "fp8_tflops": 1604.4},
|
|
||||||
{"index": 6, "fp8_tflops": 1608.4},
|
|
||||||
{"index": 7, "fp8_tflops": 1609.1}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1608.6,
|
|
||||||
"min_tflops": 1599.0,
|
|
||||||
"max_tflops": 1615.6,
|
|
||||||
"spread_pct": 1.03
|
|
||||||
}
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"per_gpu": [
|
|
||||||
{"index": 0, "fp8_tflops": 1602.3},
|
|
||||||
{"index": 1, "fp8_tflops": 1604.0},
|
|
||||||
{"index": 2, "fp8_tflops": 1616.9},
|
|
||||||
{"index": 3, "fp8_tflops": 1610.6},
|
|
||||||
{"index": 4, "fp8_tflops": 1620.5},
|
|
||||||
{"index": 5, "fp8_tflops": 1630.3},
|
|
||||||
{"index": 6, "fp8_tflops": 1605.1},
|
|
||||||
{"index": 7, "fp8_tflops": 1620.2}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1613.7,
|
|
||||||
"min_tflops": 1602.3,
|
|
||||||
"max_tflops": 1630.3,
|
|
||||||
"spread_pct": 1.74
|
|
||||||
}
|
|
||||||
@ -1,169 +0,0 @@
|
|||||||
# FP8 GEMM 路径对比测试报告
|
|
||||||
|
|
||||||
测试日期:2026-05-25
|
|
||||||
测试节点:aikubeworker0012、aikubeworker0016
|
|
||||||
测试 GPU:NVIDIA H100 80GB HBM3
|
|
||||||
测试目标:对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
|
|
||||||
|
|
||||||
## 一、测试结论
|
|
||||||
|
|
||||||
本次 A-E 五条路径均已完成实测。
|
|
||||||
|
|
||||||
核心结论:
|
|
||||||
|
|
||||||
1. direct cuBLASLt 是本组测试里最快路径,两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
|
|
||||||
2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
|
|
||||||
3. 打开 `use_fast_accum=True` 后,PyTorch eager 路径有稳定提升,约提升 5.0%-6.7%。
|
|
||||||
4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS,但仍低于 direct cuBLASLt。
|
|
||||||
5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径,不是裸 GEMM,因此包含 TE module、cast、FP8 recipe 等额外开销,结果低于 direct cuBLASLt,也低于 CUDA Graph `_scaled_mm`。
|
|
||||||
|
|
||||||
这说明:当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题;之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果,主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能,而不是 GPU 算力极限。
|
|
||||||
|
|
||||||
## 二、测试方法
|
|
||||||
|
|
||||||
统一参数:
|
|
||||||
|
|
||||||
| 参数 | 值 |
|
|
||||||
|---|---:|
|
|
||||||
| matrix_size | 8192 |
|
|
||||||
| M/N/K | 8192/8192/8192 |
|
|
||||||
| warmup | 50 |
|
|
||||||
| iterations | 500 |
|
|
||||||
| GPU index | 0 |
|
|
||||||
| PyTorch | 2.6.0+cu124 |
|
|
||||||
| CUDA | 12.4 |
|
|
||||||
| 输入 dtype | FP8 E4M3 |
|
|
||||||
| 输出 dtype | BF16 |
|
|
||||||
| accumulation | FP32 |
|
|
||||||
| scale_a / scale_b | 1.0 / 1.0 |
|
|
||||||
|
|
||||||
测试路径定义:
|
|
||||||
|
|
||||||
| 路径 | 名称 | 含义 |
|
|
||||||
|---|---|---|
|
|
||||||
| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`,默认 accumulation 参数 |
|
|
||||||
| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径,但显式打开 fast accumulation |
|
|
||||||
| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用,降低 Python/PyTorch launch 间隙 |
|
|
||||||
| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行,包含 TE 层封装和 FP8 recipe 开销 |
|
|
||||||
| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`,绕过 PyTorch eager |
|
|
||||||
|
|
||||||
复现脚本:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
|
|
||||||
/root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
## 三、实测结果
|
|
||||||
|
|
||||||
### aikubeworker0012
|
|
||||||
|
|
||||||
原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
|
|
||||||
|
|
||||||
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|
|
||||||
|---|---|---:|---:|
|
|
||||||
| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
|
|
||||||
| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
|
|
||||||
| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
|
|
||||||
| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
|
|
||||||
| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
|
|
||||||
|
|
||||||
相对 A 的提升:
|
|
||||||
|
|
||||||
| 路径 | 相对 A |
|
|
||||||
|---|---:|
|
|
||||||
| B | +6.7% |
|
|
||||||
| C | +11.5% |
|
|
||||||
| D | -2.8% |
|
|
||||||
| E | +37.1% |
|
|
||||||
|
|
||||||
E 路径 cuBLASLt 算法信息:
|
|
||||||
|
|
||||||
| 字段 | 值 |
|
|
||||||
|---|---:|
|
|
||||||
| algo_id | 52 |
|
|
||||||
| tile_id | 23 |
|
|
||||||
| splitk | 1 |
|
|
||||||
| stages_id | 36 |
|
|
||||||
| inner_shape_id | 0 |
|
|
||||||
| cluster_shape_id | 3 |
|
|
||||||
|
|
||||||
### aikubeworker0016
|
|
||||||
|
|
||||||
原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
|
|
||||||
|
|
||||||
| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
|
|
||||||
|---|---|---:|---:|
|
|
||||||
| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
|
|
||||||
| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
|
|
||||||
| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
|
|
||||||
| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
|
|
||||||
| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
|
|
||||||
|
|
||||||
相对 A 的提升:
|
|
||||||
|
|
||||||
| 路径 | 相对 A |
|
|
||||||
|---|---:|
|
|
||||||
| B | +5.0% |
|
|
||||||
| C | +10.0% |
|
|
||||||
| D | -3.2% |
|
|
||||||
| E | +37.5% |
|
|
||||||
|
|
||||||
E 路径 cuBLASLt 算法信息:
|
|
||||||
|
|
||||||
| 字段 | 值 |
|
|
||||||
|---|---:|
|
|
||||||
| algo_id | 52 |
|
|
||||||
| tile_id | 23 |
|
|
||||||
| splitk | 1 |
|
|
||||||
| stages_id | 36 |
|
|
||||||
| inner_shape_id | 0 |
|
|
||||||
| cluster_shape_id | 3 |
|
|
||||||
|
|
||||||
## 四、对 PyTorch FP8 能否“上去”的判断
|
|
||||||
|
|
||||||
从本次结果看,PyTorch FP8 路径可以通过两类方式上去:
|
|
||||||
|
|
||||||
1. 打开更快的 math/accumulation 参数,例如 `use_fast_accum=True`。
|
|
||||||
2. 使用 CUDA Graph replay,减少 eager 模式下每轮调度、enqueue 之间的间隙。
|
|
||||||
|
|
||||||
但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下,PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说,direct cuBLASLt 证明硬件和底层库有能力跑得更高;PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
|
|
||||||
|
|
||||||
如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”,后续可以继续验证:
|
|
||||||
|
|
||||||
1. 更大的 GEMM size,例如 16384。
|
|
||||||
2. 固定 shape 后用 `torch.compile` 或 Inductor。
|
|
||||||
3. CUDA Graph 覆盖更完整的 step,而不是只 replay 单个 op。
|
|
||||||
4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark,而不是 `te.Linear` module forward。
|
|
||||||
5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取,确认实际 kernel、间隙和 cuBLASLt 算法选择。
|
|
||||||
|
|
||||||
## 五、术语说明
|
|
||||||
|
|
||||||
`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`,PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic,然后把 matmul enqueue 到 CUDA stream。
|
|
||||||
|
|
||||||
`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口,支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
|
|
||||||
|
|
||||||
`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`,不经过 PyTorch eager,因此更接近裸 GEMM 峰值。
|
|
||||||
|
|
||||||
`CUDA Graph` 指把一次 CUDA work 提前捕获成图,后续直接 replay,减少 CPU 侧反复 launch/调度带来的间隙。
|
|
||||||
|
|
||||||
`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward,不等同于裸 GEMM microbenchmark。
|
|
||||||
|
|
||||||
## 六、文件清单
|
|
||||||
|
|
||||||
本地脚本:
|
|
||||||
|
|
||||||
| 文件 | 用途 |
|
|
||||||
|---|---|
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
|
|
||||||
|
|
||||||
本地结果:
|
|
||||||
|
|
||||||
| 文件 | 用途 |
|
|
||||||
|---|---|
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
|
|
||||||
| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
|
|
||||||
|
|
||||||
@ -1,142 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "fp8_path_comparison",
|
|
||||||
"host": null,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"gpu_index": 0,
|
|
||||||
"pytorch": {
|
|
||||||
"source": "pytorch_fp8_path_bench",
|
|
||||||
"torch": "2.6.0+cu124",
|
|
||||||
"cuda": "12.4",
|
|
||||||
"gpu_index": 0,
|
|
||||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 465.145,
|
|
||||||
"event_us_per_iter": 930.29,
|
|
||||||
"wall_ms_total": 465.21,
|
|
||||||
"tflops": 1181.9
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 440.252,
|
|
||||||
"event_us_per_iter": 880.504,
|
|
||||||
"wall_ms_total": 440.289,
|
|
||||||
"tflops": 1248.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 415.631,
|
|
||||||
"event_us_per_iter": 831.262,
|
|
||||||
"wall_ms_total": 415.664,
|
|
||||||
"tflops": 1322.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"summary": {
|
|
||||||
"max_tflops": 1322.7,
|
|
||||||
"min_tflops": 1181.9,
|
|
||||||
"mean_tflops": 1251.1
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cublaslt": {
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"per_gpu": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"fp8_tflops": 1615.4,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1615.4,
|
|
||||||
"min_tflops": 1615.4,
|
|
||||||
"max_tflops": 1615.4,
|
|
||||||
"spread_pct": 0.0
|
|
||||||
},
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 465.145,
|
|
||||||
"event_us_per_iter": 930.29,
|
|
||||||
"wall_ms_total": 465.21,
|
|
||||||
"tflops": 1181.9
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 440.252,
|
|
||||||
"event_us_per_iter": 880.504,
|
|
||||||
"wall_ms_total": 440.289,
|
|
||||||
"tflops": 1248.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 415.631,
|
|
||||||
"event_us_per_iter": 831.262,
|
|
||||||
"wall_ms_total": 415.664,
|
|
||||||
"tflops": 1322.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3,
|
|
||||||
"name": "E_direct_cublaslt_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"tflops": 1615.4,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1,156 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "fp8_path_comparison",
|
|
||||||
"host": null,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"gpu_index": 0,
|
|
||||||
"pytorch": {
|
|
||||||
"source": "pytorch_fp8_path_bench",
|
|
||||||
"torch": "2.6.0+cu124",
|
|
||||||
"cuda": "12.4",
|
|
||||||
"gpu_index": 0,
|
|
||||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 463.507,
|
|
||||||
"event_us_per_iter": 927.014,
|
|
||||||
"wall_ms_total": 463.573,
|
|
||||||
"tflops": 1186.1
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 434.241,
|
|
||||||
"event_us_per_iter": 868.481,
|
|
||||||
"wall_ms_total": 434.492,
|
|
||||||
"tflops": 1266.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 415.786,
|
|
||||||
"event_us_per_iter": 831.573,
|
|
||||||
"wall_ms_total": 415.825,
|
|
||||||
"tflops": 1322.2
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 476.739,
|
|
||||||
"event_us_per_iter": 953.478,
|
|
||||||
"wall_ms_total": 476.8,
|
|
||||||
"tflops": 1153.2,
|
|
||||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"summary": {
|
|
||||||
"max_tflops": 1322.2,
|
|
||||||
"min_tflops": 1153.2,
|
|
||||||
"mean_tflops": 1231.9
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cublaslt": {
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"per_gpu": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"fp8_tflops": 1626.6,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1626.6,
|
|
||||||
"min_tflops": 1626.6,
|
|
||||||
"max_tflops": 1626.6,
|
|
||||||
"spread_pct": 0.0
|
|
||||||
},
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 463.507,
|
|
||||||
"event_us_per_iter": 927.014,
|
|
||||||
"wall_ms_total": 463.573,
|
|
||||||
"tflops": 1186.1
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 434.241,
|
|
||||||
"event_us_per_iter": 868.481,
|
|
||||||
"wall_ms_total": 434.492,
|
|
||||||
"tflops": 1266.0
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 415.786,
|
|
||||||
"event_us_per_iter": 831.573,
|
|
||||||
"wall_ms_total": 415.825,
|
|
||||||
"tflops": 1322.2
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 476.739,
|
|
||||||
"event_us_per_iter": 953.478,
|
|
||||||
"wall_ms_total": 476.8,
|
|
||||||
"tflops": 1153.2,
|
|
||||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3,
|
|
||||||
"name": "E_direct_cublaslt_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"tflops": 1626.6,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1,142 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "fp8_path_comparison",
|
|
||||||
"host": null,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"gpu_index": 0,
|
|
||||||
"pytorch": {
|
|
||||||
"source": "pytorch_fp8_path_bench",
|
|
||||||
"torch": "2.6.0+cu124",
|
|
||||||
"cuda": "12.4",
|
|
||||||
"gpu_index": 0,
|
|
||||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 470.909,
|
|
||||||
"event_us_per_iter": 941.817,
|
|
||||||
"wall_ms_total": 470.974,
|
|
||||||
"tflops": 1167.4
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 452.608,
|
|
||||||
"event_us_per_iter": 905.215,
|
|
||||||
"wall_ms_total": 452.647,
|
|
||||||
"tflops": 1214.6
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 427.724,
|
|
||||||
"event_us_per_iter": 855.449,
|
|
||||||
"wall_ms_total": 427.768,
|
|
||||||
"tflops": 1285.3
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"summary": {
|
|
||||||
"max_tflops": 1285.3,
|
|
||||||
"min_tflops": 1167.4,
|
|
||||||
"mean_tflops": 1222.4
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cublaslt": {
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"per_gpu": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"fp8_tflops": 1594.3,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1594.3,
|
|
||||||
"min_tflops": 1594.3,
|
|
||||||
"max_tflops": 1594.3,
|
|
||||||
"spread_pct": 0.0
|
|
||||||
},
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 470.909,
|
|
||||||
"event_us_per_iter": 941.817,
|
|
||||||
"wall_ms_total": 470.974,
|
|
||||||
"tflops": 1167.4
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 452.608,
|
|
||||||
"event_us_per_iter": 905.215,
|
|
||||||
"wall_ms_total": 452.647,
|
|
||||||
"tflops": 1214.6
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 427.724,
|
|
||||||
"event_us_per_iter": 855.449,
|
|
||||||
"wall_ms_total": 427.768,
|
|
||||||
"tflops": 1285.3
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": "ModuleNotFoundError: No module named 'transformer_engine'"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3,
|
|
||||||
"name": "E_direct_cublaslt_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"tflops": 1594.3,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1,156 +0,0 @@
|
|||||||
{
|
|
||||||
"source": "fp8_path_comparison",
|
|
||||||
"host": null,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"gpu_index": 0,
|
|
||||||
"pytorch": {
|
|
||||||
"source": "pytorch_fp8_path_bench",
|
|
||||||
"torch": "2.6.0+cu124",
|
|
||||||
"cuda": "12.4",
|
|
||||||
"gpu_index": 0,
|
|
||||||
"gpu_name": "NVIDIA H100 80GB HBM3",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 473.156,
|
|
||||||
"event_us_per_iter": 946.313,
|
|
||||||
"wall_ms_total": 473.199,
|
|
||||||
"tflops": 1161.9
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 450.48,
|
|
||||||
"event_us_per_iter": 900.96,
|
|
||||||
"wall_ms_total": 450.505,
|
|
||||||
"tflops": 1220.4
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 430.272,
|
|
||||||
"event_us_per_iter": 860.543,
|
|
||||||
"wall_ms_total": 430.304,
|
|
||||||
"tflops": 1277.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 488.527,
|
|
||||||
"event_us_per_iter": 977.054,
|
|
||||||
"wall_ms_total": 488.576,
|
|
||||||
"tflops": 1125.3,
|
|
||||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"summary": {
|
|
||||||
"max_tflops": 1277.7,
|
|
||||||
"min_tflops": 1125.3,
|
|
||||||
"mean_tflops": 1196.3
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cublaslt": {
|
|
||||||
"source": "cuBLASLt",
|
|
||||||
"dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"warmup": 50,
|
|
||||||
"iterations": 500,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"per_gpu": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"fp8_tflops": 1598.1,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"mean_tflops": 1598.1,
|
|
||||||
"min_tflops": 1598.1,
|
|
||||||
"max_tflops": 1598.1,
|
|
||||||
"spread_pct": 0.0
|
|
||||||
},
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"name": "A_eager_scaled_mm_default",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 473.156,
|
|
||||||
"event_us_per_iter": 946.313,
|
|
||||||
"wall_ms_total": 473.199,
|
|
||||||
"tflops": 1161.9
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "B_eager_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 450.48,
|
|
||||||
"event_us_per_iter": 900.96,
|
|
||||||
"wall_ms_total": 450.505,
|
|
||||||
"tflops": 1220.4
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 3,
|
|
||||||
"event_ms_total": 430.272,
|
|
||||||
"event_us_per_iter": 860.543,
|
|
||||||
"wall_ms_total": 430.304,
|
|
||||||
"tflops": 1277.7
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"event_ms_total": 488.527,
|
|
||||||
"event_us_per_iter": 977.054,
|
|
||||||
"wall_ms_total": 488.576,
|
|
||||||
"tflops": 1125.3,
|
|
||||||
"note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"algo_id": 52,
|
|
||||||
"tile_id": 23,
|
|
||||||
"splitk": 1,
|
|
||||||
"stages_id": 36,
|
|
||||||
"inner_shape_id": 0,
|
|
||||||
"cluster_shape_id": 3,
|
|
||||||
"name": "E_direct_cublaslt_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"tflops": 1598.1,
|
|
||||||
"matrix_size": 8192,
|
|
||||||
"iterations": 500,
|
|
||||||
"warmup": 50,
|
|
||||||
"fast_accum": 1,
|
|
||||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1,152 +0,0 @@
|
|||||||
# GPU_Test 合并报告
|
|
||||||
|
|
||||||
- **日期:** 2026-05-24
|
|
||||||
- **节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16`
|
|
||||||
- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
|
|
||||||
- **范围:** 单机单卡算力与多机多卡 NCCL 通信
|
|
||||||
- **说明:** 本报告汇总既有原始测试结果,不重新启动额外压力测试。
|
|
||||||
|
|
||||||
## 总体结论
|
|
||||||
|
|
||||||
| 测试项 | 结论 | 说明 |
|
|
||||||
|---|---|---|
|
|
||||||
| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
|
|
||||||
| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
|
|
||||||
| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`,低于阈值;已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低,不作为硬件失败依据 |
|
|
||||||
| 多机多卡 NCCL 正确性 | PASS | return code `0`,`Wrong=0` / `Out of bounds values: 0 OK` |
|
|
||||||
| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值,但该阈值不应直接硬套到当前 4x400Gbps 环境 |
|
|
||||||
|
|
||||||
## 单机单卡 / 算力测试
|
|
||||||
|
|
||||||
### 机器信息
|
|
||||||
|
|
||||||
| Host | GPU | Driver | CUDA | GPU 数量 |
|
|
||||||
|---|---|---|---|---:|
|
|
||||||
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
|
||||||
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
|
||||||
|
|
||||||
来源:
|
|
||||||
|
|
||||||
- `reports_single_gpu_aikubeworker0012.md`
|
|
||||||
- `reports_single_gpu_aikubeworker0016.md`
|
|
||||||
|
|
||||||
### 原始 PyTorch 单机算力结果
|
|
||||||
|
|
||||||
| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
|
|
||||||
|---|---:|---:|---:|---:|---:|---|
|
|
||||||
| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
|
|
||||||
| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
|
|
||||||
|
|
||||||
原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示,该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响,不能直接代表 H100 FP8 Tensor Core 硬件上限。
|
|
||||||
|
|
||||||
### direct cuBLASLt FP8 GEMM 交叉验证
|
|
||||||
|
|
||||||
测试参数:
|
|
||||||
|
|
||||||
| 参数 | 值 |
|
|
||||||
|---|---|
|
|
||||||
| Benchmark | direct cuBLASLt FP8 GEMM |
|
|
||||||
| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
|
|
||||||
| Matrix | `8192 x 8192 x 8192` |
|
|
||||||
| A/B dtype | FP8 E4M3 |
|
|
||||||
| Output dtype | BF16 |
|
|
||||||
| Compute type | `CUBLAS_COMPUTE_32F` |
|
|
||||||
| Scale type | `CUDA_R_32F` |
|
|
||||||
| Scale A/B | `1.0` |
|
|
||||||
| Layout | TN |
|
|
||||||
| fast accumulation | enabled |
|
|
||||||
| Threshold | `>= 1400 TFLOPS` |
|
|
||||||
|
|
||||||
结果:
|
|
||||||
|
|
||||||
| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
|
|
||||||
|---|---:|---:|---:|---:|---:|---|
|
|
||||||
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
|
|
||||||
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
|
|
||||||
|
|
||||||
单卡逐张结果:
|
|
||||||
|
|
||||||
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|
|
||||||
|---|---:|---:|---:|---:|---:|---:|---:|---:|
|
|
||||||
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
|
|
||||||
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
|
|
||||||
|
|
||||||
结论:direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值,说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录,而不是 GPU 硬件失败结论。
|
|
||||||
|
|
||||||
来源:
|
|
||||||
|
|
||||||
- `reports_cublaslt_fp8_crosscheck_20260524.md`
|
|
||||||
- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
|
|
||||||
- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
|
|
||||||
|
|
||||||
## 多机多卡 NCCL 测试
|
|
||||||
|
|
||||||
### 测试环境
|
|
||||||
|
|
||||||
| 项目 | 结果 |
|
|
||||||
|---|---|
|
|
||||||
| Hosts | `nccl-gpu-1(172.72.8.12)`,`nccl-gpu-2(172.72.8.16)` |
|
|
||||||
| Topology | 2 nodes x 8 GPUs,合计 16 GPUs |
|
|
||||||
| NCCL source | `nccl-tests-mpirun` |
|
|
||||||
| NCCL network | IB |
|
|
||||||
| GPU Direct RDMA | ENABLED |
|
|
||||||
| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
|
|
||||||
| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
|
|
||||||
|
|
||||||
注意:NCCL 表里的 `GB/s` 是大 B,即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b,即 bits/s。
|
|
||||||
|
|
||||||
### 2x8 全集合通信结果
|
|
||||||
|
|
||||||
| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
|
|
||||||
|---|---:|---:|---:|---|---|
|
|
||||||
| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
|
|
||||||
| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 |
|
|
||||||
| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
|
||||||
| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
|
||||||
| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
|
||||||
| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
|
|
||||||
|
|
||||||
结论:2x8 全集合通信测试中,NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值,但当前机器确认参与 NCCL 的是 4 条 400Gbps rail,因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
|
|
||||||
|
|
||||||
来源:
|
|
||||||
|
|
||||||
- `reports_multinode_nccl_all_collectives_20260523_120144.md`
|
|
||||||
- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
|
|
||||||
|
|
||||||
### PDF Matrix allreduce / alltoall 结果
|
|
||||||
|
|
||||||
AllReduce(PDF 8x400Gbps 阈值对比,仅作参考):
|
|
||||||
|
|
||||||
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|
|
||||||
|---|---:|---:|---:|---:|---|
|
|
||||||
| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
|
|
||||||
| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
|
|
||||||
| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
|
|
||||||
| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
|
|
||||||
|
|
||||||
AllToAll(PDF 8x400Gbps 阈值对比,仅作参考):
|
|
||||||
|
|
||||||
| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
|
|
||||||
|---|---:|---:|---:|---:|---|
|
|
||||||
| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
|
|
||||||
| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
|
|
||||||
| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
|
|
||||||
| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 |
|
|
||||||
|
|
||||||
来源:
|
|
||||||
|
|
||||||
- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
|
|
||||||
- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
|
|
||||||
|
|
||||||
## 风险与判断
|
|
||||||
|
|
||||||
1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证,当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
|
|
||||||
2. 多机 NCCL 正确性通过,性能结果应按当前 4x400Gbps rail 环境解释。
|
|
||||||
3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail;PDF 参考环境为 8x400G 计算管理网络,因此 2x8 阈值与当前硬件形态不等价。
|
|
||||||
4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值,建议作为“与 PDF 参考环境差异”记录,而不是作为当前 4 rail 环境不合格结论。
|
|
||||||
|
|
||||||
## 建议
|
|
||||||
|
|
||||||
1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主,PyTorch `_scaled_mm` 作为软件栈参考项保留。
|
|
||||||
2. 多机 NCCL 后续若要按 PDF 阈值验收,需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
|
|
||||||
3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`:NCCL bus bandwidth 是大 B,IB 端口速率是小 b。
|
|
||||||
@ -1,123 +0,0 @@
|
|||||||
# GPU_Test 双节点测试报告
|
|
||||||
|
|
||||||
- **测试日期:** 2026-05-24
|
|
||||||
- **测试节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16`
|
|
||||||
- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
|
|
||||||
- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
|
|
||||||
- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
|
|
||||||
|
|
||||||
## 结论摘要
|
|
||||||
|
|
||||||
| 项目 | 结果摘要 |
|
|
||||||
|---|---|
|
|
||||||
| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
|
|
||||||
| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS |
|
|
||||||
| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成,主要大包通信带宽稳定 |
|
|
||||||
| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过,所有测试 `Wrong=0` / return code `0` |
|
|
||||||
| 多机网络口径 | 当前为 4x400Gbps IB rail 环境,结果按该硬件形态解释 |
|
|
||||||
|
|
||||||
## 测试环境
|
|
||||||
|
|
||||||
| Host | GPU | Driver | CUDA | GPU 数量 |
|
|
||||||
|---|---|---|---|---:|
|
|
||||||
| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
|
||||||
| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
|
|
||||||
|
|
||||||
## 单机算力测试
|
|
||||||
|
|
||||||
### FP8 GEMM 硬件路径验证
|
|
||||||
|
|
||||||
本项使用 direct cuBLASLt FP8 GEMM benchmark,绕过 PyTorch eager 调度路径,直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
|
|
||||||
|
|
||||||
| 参数 | 配置 |
|
|
||||||
|---|---|
|
|
||||||
| GEMM shape | `8192 x 8192 x 8192` |
|
|
||||||
| 输入类型 | FP8 E4M3 |
|
|
||||||
| 输出类型 | BF16 |
|
|
||||||
| 累加类型 | FP32 compute |
|
|
||||||
| Layout | TN |
|
|
||||||
| Scale | `scale_a = 1.0`,`scale_b = 1.0` |
|
|
||||||
| fast accumulation | enabled |
|
|
||||||
| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
|
|
||||||
|
|
||||||
| Host | Mean FP8 TFLOPS | Min | Max | Spread |
|
|
||||||
|---|---:|---:|---:|---:|
|
|
||||||
| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
|
|
||||||
| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
|
|
||||||
|
|
||||||
| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
|
|
||||||
|---|---:|---:|---:|---:|---:|---:|---:|---:|
|
|
||||||
| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
|
|
||||||
| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
|
|
||||||
|
|
||||||
**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS,该结果反映 PyTorch 软件路径与调度开销,不作为本报告的硬件算力结论。
|
|
||||||
|
|
||||||
## 单机 8 卡 NCCL 通信测试
|
|
||||||
|
|
||||||
本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试,结果单位为 `GB/s`,即 Bytes/s。
|
|
||||||
|
|
||||||
| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
|
|
||||||
|---|---:|---:|
|
|
||||||
| allreduce | 472.3 GB/s | 472.4 GB/s |
|
|
||||||
| alltoall | 343.3 GB/s | 344.3 GB/s |
|
|
||||||
| broadcast | 364.1 GB/s | 363.6 GB/s |
|
|
||||||
| reducescatter | 352.8 GB/s | 353.1 GB/s |
|
|
||||||
| allgather | 366.4 GB/s | 366.4 GB/s |
|
|
||||||
| sendrecv | 369.0 GB/s | 368.9 GB/s |
|
|
||||||
|
|
||||||
**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近,节点间差异较小。
|
|
||||||
|
|
||||||
## 多机 2x8 NCCL 通信测试
|
|
||||||
|
|
||||||
本项使用两台节点,每台 8 张 GPU,共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
|
|
||||||
|
|
||||||
### 网络环境
|
|
||||||
|
|
||||||
| 项目 | 配置 |
|
|
||||||
|---|---|
|
|
||||||
| Host A | `aikubeworker0012 / 172.72.8.12` |
|
|
||||||
| Host B | `aikubeworker0016 / 172.72.8.16` |
|
|
||||||
| 拓扑 | 2 nodes x 8 GPUs |
|
|
||||||
| NCCL network | IB |
|
|
||||||
| GPU Direct RDMA | ENABLED |
|
|
||||||
| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
|
|
||||||
| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
|
|
||||||
|
|
||||||
### 跨节点 NCCL 结果
|
|
||||||
|
|
||||||
| Operation | Peak Bus BW | Avg Bus BW | Correctness |
|
|
||||||
|---|---:|---:|---|
|
|
||||||
| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
|
|
||||||
| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
|
|
||||||
| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
|
|
||||||
| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
|
|
||||||
| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
|
|
||||||
| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
|
|
||||||
|
|
||||||
**正确性:** 本轮多机 NCCL 测试 return code 为 `0`,`Wrong=0`,未发现数据正确性错误。
|
|
||||||
|
|
||||||
## 单位说明
|
|
||||||
|
|
||||||
| 写法 | 含义 | 说明 |
|
|
||||||
|---|---|---|
|
|
||||||
| `GB/s` | Gigabytes per second | 大 B,字节每秒,NCCL bus bandwidth 使用此单位 |
|
|
||||||
| `Gbps` / `Gb/s` | Gigabits per second | 小 b,比特每秒,IB 端口速率通常使用此单位 |
|
|
||||||
|
|
||||||
换算关系:
|
|
||||||
|
|
||||||
```text
|
|
||||||
1 Byte = 8 bits
|
|
||||||
400 Gb/s = 50 GB/s
|
|
||||||
4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
|
|
||||||
```
|
|
||||||
|
|
||||||
NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽,不等同于单条物理链路的线速。
|
|
||||||
|
|
||||||
## 结果说明
|
|
||||||
|
|
||||||
1. 两台节点 GPU 识别正常,均为 8 张 H100 80GB HBM3。
|
|
||||||
2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPS,GPU FP8 硬件计算路径正常。
|
|
||||||
3. 单机 8 卡 NCCL 通信在两台节点上结果接近,未观察到明显节点间异常差异。
|
|
||||||
4. 多机 2x8 NCCL 正确性通过,跨节点通信功能正常。
|
|
||||||
5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释;若后续需要对齐 8x400Gbps 环境,应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
|
|
||||||
|
|
||||||
@ -1,102 +0,0 @@
|
|||||||
@page {
|
|
||||||
size: A4 landscape;
|
|
||||||
margin: 13mm;
|
|
||||||
}
|
|
||||||
|
|
||||||
body {
|
|
||||||
color: #111827;
|
|
||||||
font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
|
|
||||||
font-size: 11px;
|
|
||||||
line-height: 1.45;
|
|
||||||
}
|
|
||||||
|
|
||||||
h1 {
|
|
||||||
color: #0f172a;
|
|
||||||
font-size: 24px;
|
|
||||||
margin: 0 0 14px;
|
|
||||||
}
|
|
||||||
|
|
||||||
h2 {
|
|
||||||
border-bottom: 1px solid #cbd5e1;
|
|
||||||
color: #0f172a;
|
|
||||||
font-size: 17px;
|
|
||||||
margin: 24px 0 10px;
|
|
||||||
padding-bottom: 4px;
|
|
||||||
}
|
|
||||||
|
|
||||||
h3 {
|
|
||||||
color: #1f2937;
|
|
||||||
font-size: 13px;
|
|
||||||
margin: 16px 0 8px;
|
|
||||||
}
|
|
||||||
|
|
||||||
p {
|
|
||||||
margin: 7px 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
code {
|
|
||||||
background: #f1f5f9;
|
|
||||||
border-radius: 3px;
|
|
||||||
color: #0f172a;
|
|
||||||
font-family: Menlo, Consolas, monospace;
|
|
||||||
font-size: 10px;
|
|
||||||
padding: 1px 3px;
|
|
||||||
}
|
|
||||||
|
|
||||||
pre {
|
|
||||||
background: #f8fafc;
|
|
||||||
border: 1px solid #e2e8f0;
|
|
||||||
border-radius: 4px;
|
|
||||||
padding: 8px;
|
|
||||||
white-space: pre-wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
table {
|
|
||||||
border-collapse: collapse;
|
|
||||||
margin: 8px 0 14px;
|
|
||||||
page-break-inside: auto;
|
|
||||||
width: 100%;
|
|
||||||
}
|
|
||||||
|
|
||||||
thead {
|
|
||||||
display: table-header-group;
|
|
||||||
}
|
|
||||||
|
|
||||||
tr {
|
|
||||||
page-break-inside: avoid;
|
|
||||||
}
|
|
||||||
|
|
||||||
th,
|
|
||||||
td {
|
|
||||||
border: 1px solid #cbd5e1;
|
|
||||||
padding: 5px 6px;
|
|
||||||
text-align: left;
|
|
||||||
vertical-align: middle;
|
|
||||||
word-break: break-word;
|
|
||||||
}
|
|
||||||
|
|
||||||
th {
|
|
||||||
background: #e2e8f0;
|
|
||||||
color: #0f172a;
|
|
||||||
font-weight: 700;
|
|
||||||
}
|
|
||||||
|
|
||||||
tbody tr:nth-child(even) td {
|
|
||||||
background: #f8fafc;
|
|
||||||
}
|
|
||||||
|
|
||||||
a {
|
|
||||||
color: #2563eb;
|
|
||||||
text-decoration: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
ul,
|
|
||||||
ol {
|
|
||||||
margin: 6px 0 10px 20px;
|
|
||||||
padding: 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
li {
|
|
||||||
margin: 3px 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,291 +0,0 @@
|
|||||||
#include <cublasLt.h>
|
|
||||||
#include <cuda_bf16.h>
|
|
||||||
#include <cuda_fp8.h>
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include <numeric>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#define CHECK_CUDA(call) \
|
|
||||||
do { \
|
|
||||||
cudaError_t status = (call); \
|
|
||||||
if (status != cudaSuccess) { \
|
|
||||||
std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
|
|
||||||
cudaGetErrorString(status)); \
|
|
||||||
std::exit(1); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define CHECK_CUBLAS(call) \
|
|
||||||
do { \
|
|
||||||
cublasStatus_t status = (call); \
|
|
||||||
if (status != CUBLAS_STATUS_SUCCESS) { \
|
|
||||||
std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \
|
|
||||||
__LINE__, static_cast<int>(status)); \
|
|
||||||
std::exit(1); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
|
|
||||||
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
size_t stride = blockDim.x * gridDim.x;
|
|
||||||
for (size_t i = tid; i < count; i += stride) {
|
|
||||||
ptr[i] = __nv_fp8_e4m3(value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Args {
|
|
||||||
int matrix_size = 8192;
|
|
||||||
int warmup = 20;
|
|
||||||
int iterations = 200;
|
|
||||||
int first_gpu = 0;
|
|
||||||
int gpu_count = -1;
|
|
||||||
size_t workspace_mb = 256;
|
|
||||||
int fast_accum = 1;
|
|
||||||
};
|
|
||||||
|
|
||||||
static Args parse_args(int argc, char **argv) {
|
|
||||||
Args args;
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
|
||||||
auto need = [&](const char *name) {
|
|
||||||
if (i + 1 >= argc) {
|
|
||||||
std::fprintf(stderr, "Missing value for %s\n", name);
|
|
||||||
std::exit(2);
|
|
||||||
}
|
|
||||||
return argv[++i];
|
|
||||||
};
|
|
||||||
if (!std::strcmp(argv[i], "--matrix-size")) {
|
|
||||||
args.matrix_size = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--warmup")) {
|
|
||||||
args.warmup = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--iterations")) {
|
|
||||||
args.iterations = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--first-gpu")) {
|
|
||||||
args.first_gpu = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--gpu-count")) {
|
|
||||||
args.gpu_count = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--workspace-mb")) {
|
|
||||||
args.workspace_mb = static_cast<size_t>(std::atoll(need(argv[i])));
|
|
||||||
} else if (!std::strcmp(argv[i], "--fast-accum")) {
|
|
||||||
args.fast_accum = std::atoi(need(argv[i]));
|
|
||||||
} else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
|
|
||||||
std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
|
|
||||||
"[--iterations N] [--first-gpu N] [--gpu-count N] "
|
|
||||||
"[--workspace-mb N] [--fast-accum 0|1]");
|
|
||||||
std::exit(0);
|
|
||||||
} else {
|
|
||||||
std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
|
|
||||||
std::exit(2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return args;
|
|
||||||
}
|
|
||||||
|
|
||||||
static double run_one_gpu(int gpu, const Args &args) {
|
|
||||||
CHECK_CUDA(cudaSetDevice(gpu));
|
|
||||||
|
|
||||||
const int64_t m = args.matrix_size;
|
|
||||||
const int64_t n = args.matrix_size;
|
|
||||||
const int64_t k = args.matrix_size;
|
|
||||||
const size_t a_elems = static_cast<size_t>(m) * k;
|
|
||||||
const size_t b_elems = static_cast<size_t>(k) * n;
|
|
||||||
const size_t d_elems = static_cast<size_t>(m) * n;
|
|
||||||
|
|
||||||
__nv_fp8_e4m3 *d_a = nullptr;
|
|
||||||
__nv_fp8_e4m3 *d_b = nullptr;
|
|
||||||
__nv_bfloat16 *d_d = nullptr;
|
|
||||||
void *workspace = nullptr;
|
|
||||||
float *d_scale_a = nullptr;
|
|
||||||
float *d_scale_b = nullptr;
|
|
||||||
const float scale = 1.0f;
|
|
||||||
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
|
|
||||||
|
|
||||||
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
|
|
||||||
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
|
|
||||||
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
|
|
||||||
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
|
|
||||||
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
|
|
||||||
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
|
|
||||||
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
|
||||||
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
|
||||||
|
|
||||||
const int threads = 256;
|
|
||||||
const int blocks = 4096;
|
|
||||||
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
|
|
||||||
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
|
|
||||||
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
|
|
||||||
CHECK_CUDA(cudaGetLastError());
|
|
||||||
CHECK_CUDA(cudaDeviceSynchronize());
|
|
||||||
|
|
||||||
cublasLtHandle_t lt;
|
|
||||||
cublasLtMatmulDesc_t op_desc;
|
|
||||||
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
|
|
||||||
cublasLtMatmulPreference_t preference;
|
|
||||||
CHECK_CUBLAS(cublasLtCreate(<));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
|
|
||||||
|
|
||||||
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
|
|
||||||
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
|
|
||||||
// acceptance shape while satisfying the library's FP8 kernel constraints.
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
||||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
||||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
||||||
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
|
|
||||||
sizeof(d_scale_a)));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
||||||
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
|
|
||||||
sizeof(d_scale_b)));
|
|
||||||
int8_t fast_accum = args.fast_accum ? 1 : 0;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
||||||
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
|
|
||||||
sizeof(fast_accum)));
|
|
||||||
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
|
|
||||||
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
|
|
||||||
sizeof(workspace_bytes)));
|
|
||||||
|
|
||||||
cublasLtMatmulHeuristicResult_t heuristic;
|
|
||||||
int returned = 0;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
|
|
||||||
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
|
|
||||||
&returned));
|
|
||||||
if (returned == 0) {
|
|
||||||
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
|
|
||||||
std::exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
||||||
int32_t value = -1;
|
|
||||||
size_t written = 0;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
||||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
||||||
return static_cast<int>(value);
|
|
||||||
};
|
|
||||||
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
||||||
uint32_t value = 0;
|
|
||||||
size_t written = 0;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
||||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
||||||
return static_cast<int>(value);
|
|
||||||
};
|
|
||||||
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
||||||
uint16_t value = 0;
|
|
||||||
size_t written = 0;
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
||||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
||||||
return static_cast<int>(value);
|
|
||||||
};
|
|
||||||
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
|
|
||||||
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
|
|
||||||
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
|
|
||||||
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
|
|
||||||
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
|
|
||||||
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
|
|
||||||
|
|
||||||
const float alpha = 1.0f;
|
|
||||||
const float beta = 0.0f;
|
|
||||||
auto matmul = [&]() {
|
|
||||||
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
|
|
||||||
&beta, d_d, d_desc, d_d, d_desc,
|
|
||||||
&heuristic.algo, workspace, workspace_bytes, 0));
|
|
||||||
};
|
|
||||||
|
|
||||||
for (int i = 0; i < args.warmup; ++i) {
|
|
||||||
matmul();
|
|
||||||
}
|
|
||||||
CHECK_CUDA(cudaDeviceSynchronize());
|
|
||||||
|
|
||||||
cudaEvent_t start, stop;
|
|
||||||
CHECK_CUDA(cudaEventCreate(&start));
|
|
||||||
CHECK_CUDA(cudaEventCreate(&stop));
|
|
||||||
CHECK_CUDA(cudaEventRecord(start));
|
|
||||||
for (int i = 0; i < args.iterations; ++i) {
|
|
||||||
matmul();
|
|
||||||
}
|
|
||||||
CHECK_CUDA(cudaEventRecord(stop));
|
|
||||||
CHECK_CUDA(cudaEventSynchronize(stop));
|
|
||||||
float elapsed_ms = 0.0f;
|
|
||||||
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
|
|
||||||
const double flops =
|
|
||||||
2.0 * static_cast<double>(m) * static_cast<double>(n) *
|
|
||||||
static_cast<double>(k) * static_cast<double>(args.iterations);
|
|
||||||
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
|
|
||||||
std::printf(
|
|
||||||
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
|
|
||||||
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
|
|
||||||
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
|
|
||||||
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
|
|
||||||
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
|
|
||||||
std::fflush(stdout);
|
|
||||||
|
|
||||||
CHECK_CUDA(cudaEventDestroy(start));
|
|
||||||
CHECK_CUDA(cudaEventDestroy(stop));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
|
|
||||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
|
|
||||||
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
|
|
||||||
CHECK_CUBLAS(cublasLtDestroy(lt));
|
|
||||||
CHECK_CUDA(cudaFree(d_a));
|
|
||||||
CHECK_CUDA(cudaFree(d_b));
|
|
||||||
CHECK_CUDA(cudaFree(d_d));
|
|
||||||
CHECK_CUDA(cudaFree(workspace));
|
|
||||||
CHECK_CUDA(cudaFree(d_scale_a));
|
|
||||||
CHECK_CUDA(cudaFree(d_scale_b));
|
|
||||||
CHECK_CUDA(cudaDeviceSynchronize());
|
|
||||||
|
|
||||||
return tflops;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
Args args = parse_args(argc, argv);
|
|
||||||
int device_count = 0;
|
|
||||||
CHECK_CUDA(cudaGetDeviceCount(&device_count));
|
|
||||||
if (args.gpu_count < 0) {
|
|
||||||
args.gpu_count = device_count - args.first_gpu;
|
|
||||||
}
|
|
||||||
if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
|
|
||||||
std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
|
|
||||||
args.first_gpu, args.gpu_count, device_count);
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<double> values;
|
|
||||||
std::printf("{\n");
|
|
||||||
std::printf(" \"source\": \"cuBLASLt\",\n");
|
|
||||||
std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
|
|
||||||
std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
|
|
||||||
std::printf(" \"warmup\": %d,\n", args.warmup);
|
|
||||||
std::printf(" \"iterations\": %d,\n", args.iterations);
|
|
||||||
std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
|
|
||||||
std::printf(" \"per_gpu\": [\n");
|
|
||||||
for (int i = 0; i < args.gpu_count; ++i) {
|
|
||||||
int gpu = args.first_gpu + i;
|
|
||||||
double tflops = run_one_gpu(gpu, args);
|
|
||||||
values.push_back(tflops);
|
|
||||||
}
|
|
||||||
double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
|
|
||||||
auto minmax = std::minmax_element(values.begin(), values.end());
|
|
||||||
double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
|
|
||||||
std::printf(" ],\n");
|
|
||||||
std::printf(" \"mean_tflops\": %.1f,\n", mean);
|
|
||||||
std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
|
|
||||||
std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
|
|
||||||
std::printf(" \"spread_pct\": %.2f\n", spread);
|
|
||||||
std::printf("}\n");
|
|
||||||
return mean >= 1400.0 ? 0 : 1;
|
|
||||||
}
|
|
||||||
@ -1,277 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
|
|
||||||
|
|
||||||
Paths:
|
|
||||||
A. torch._scaled_mm eager, default accumulation
|
|
||||||
B. torch._scaled_mm eager, use_fast_accum=True
|
|
||||||
C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
|
|
||||||
D. Transformer Engine Linear under fp8_autocast, when installed
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import statistics
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from typing import Any, Callable
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
|
|
||||||
def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
|
|
||||||
flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
|
|
||||||
return flops / (elapsed_ms / 1000.0) / 1e12
|
|
||||||
|
|
||||||
|
|
||||||
def cuda_event_bench(
|
|
||||||
name: str,
|
|
||||||
matrix_size: int,
|
|
||||||
iterations: int,
|
|
||||||
warmup: int,
|
|
||||||
func: Callable[[int], Any],
|
|
||||||
) -> dict[str, Any]:
|
|
||||||
for i in range(warmup):
|
|
||||||
func(i)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
start = torch.cuda.Event(enable_timing=True)
|
|
||||||
end = torch.cuda.Event(enable_timing=True)
|
|
||||||
wall_start = time.perf_counter()
|
|
||||||
start.record()
|
|
||||||
for i in range(iterations):
|
|
||||||
func(i)
|
|
||||||
end.record()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
wall_elapsed = time.perf_counter() - wall_start
|
|
||||||
elapsed_ms = start.elapsed_time(end)
|
|
||||||
return {
|
|
||||||
"name": name,
|
|
||||||
"status": "ok",
|
|
||||||
"matrix_size": matrix_size,
|
|
||||||
"iterations": iterations,
|
|
||||||
"warmup": warmup,
|
|
||||||
"event_ms_total": round(elapsed_ms, 3),
|
|
||||||
"event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
|
|
||||||
"wall_ms_total": round(wall_elapsed * 1000.0, 3),
|
|
||||||
"tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
|
|
||||||
a = [
|
|
||||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
|
||||||
for _ in range(pools)
|
|
||||||
]
|
|
||||||
b = [
|
|
||||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
|
||||||
for _ in range(pools)
|
|
||||||
]
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
return a, b
|
|
||||||
|
|
||||||
|
|
||||||
def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
|
|
||||||
device = f"cuda:{args.gpu_index}"
|
|
||||||
torch.cuda.set_device(args.gpu_index)
|
|
||||||
scale_a = torch.tensor(1.0, device=device)
|
|
||||||
scale_b = torch.tensor(1.0, device=device)
|
|
||||||
pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
|
|
||||||
results: list[dict[str, Any]] = []
|
|
||||||
|
|
||||||
def eager_default(i: int) -> torch.Tensor:
|
|
||||||
idx = i % args.pools
|
|
||||||
return torch._scaled_mm(
|
|
||||||
pools_a[idx],
|
|
||||||
pools_b[idx].T,
|
|
||||||
scale_a=scale_a,
|
|
||||||
scale_b=scale_b,
|
|
||||||
out_dtype=torch.bfloat16,
|
|
||||||
)
|
|
||||||
|
|
||||||
def eager_fast(i: int) -> torch.Tensor:
|
|
||||||
idx = i % args.pools
|
|
||||||
return torch._scaled_mm(
|
|
||||||
pools_a[idx],
|
|
||||||
pools_b[idx].T,
|
|
||||||
scale_a=scale_a,
|
|
||||||
scale_b=scale_b,
|
|
||||||
out_dtype=torch.bfloat16,
|
|
||||||
use_fast_accum=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
results.append(
|
|
||||||
cuda_event_bench(
|
|
||||||
"A_eager_scaled_mm_default",
|
|
||||||
args.matrix_size,
|
|
||||||
args.iterations,
|
|
||||||
args.warmup,
|
|
||||||
eager_default,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
results.append(
|
|
||||||
cuda_event_bench(
|
|
||||||
"B_eager_scaled_mm_fast_accum",
|
|
||||||
args.matrix_size,
|
|
||||||
args.iterations,
|
|
||||||
args.warmup,
|
|
||||||
eager_fast,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
graph_out = torch.empty(
|
|
||||||
(args.matrix_size, args.matrix_size),
|
|
||||||
device=device,
|
|
||||||
dtype=torch.bfloat16,
|
|
||||||
)
|
|
||||||
static_a = pools_a[0]
|
|
||||||
static_b_t = pools_b[0].T
|
|
||||||
|
|
||||||
try:
|
|
||||||
side_stream = torch.cuda.Stream()
|
|
||||||
side_stream.wait_stream(torch.cuda.current_stream())
|
|
||||||
with torch.cuda.stream(side_stream):
|
|
||||||
for _ in range(max(3, args.warmup // 2)):
|
|
||||||
torch._scaled_mm(
|
|
||||||
static_a,
|
|
||||||
static_b_t,
|
|
||||||
scale_a=scale_a,
|
|
||||||
scale_b=scale_b,
|
|
||||||
out_dtype=torch.bfloat16,
|
|
||||||
use_fast_accum=True,
|
|
||||||
out=graph_out,
|
|
||||||
)
|
|
||||||
torch.cuda.current_stream().wait_stream(side_stream)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(graph):
|
|
||||||
torch._scaled_mm(
|
|
||||||
static_a,
|
|
||||||
static_b_t,
|
|
||||||
scale_a=scale_a,
|
|
||||||
scale_b=scale_b,
|
|
||||||
out_dtype=torch.bfloat16,
|
|
||||||
use_fast_accum=True,
|
|
||||||
out=graph_out,
|
|
||||||
)
|
|
||||||
|
|
||||||
def graph_replay(_: int) -> None:
|
|
||||||
graph.replay()
|
|
||||||
|
|
||||||
results.append(
|
|
||||||
cuda_event_bench(
|
|
||||||
"C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
args.matrix_size,
|
|
||||||
args.iterations,
|
|
||||||
3,
|
|
||||||
graph_replay,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
results.append(
|
|
||||||
{
|
|
||||||
"name": "C_cuda_graph_scaled_mm_fast_accum",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": f"{type(exc).__name__}: {exc}",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
|
|
||||||
try:
|
|
||||||
import transformer_engine.pytorch as te # type: ignore[import-not-found]
|
|
||||||
from transformer_engine.common.recipe import DelayedScaling, Format # type: ignore[import-not-found]
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
return {
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "unavailable",
|
|
||||||
"reason": f"{type(exc).__name__}: {exc}",
|
|
||||||
}
|
|
||||||
|
|
||||||
device = f"cuda:{args.gpu_index}"
|
|
||||||
x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
|
|
||||||
layer = te.Linear(
|
|
||||||
args.matrix_size,
|
|
||||||
args.matrix_size,
|
|
||||||
bias=False,
|
|
||||||
params_dtype=torch.bfloat16,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
recipe = DelayedScaling(fp8_format=Format.HYBRID)
|
|
||||||
|
|
||||||
def run(_: int) -> torch.Tensor:
|
|
||||||
with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
|
|
||||||
return layer(x)
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = cuda_event_bench(
|
|
||||||
"D_transformer_engine_fp8_linear",
|
|
||||||
args.matrix_size,
|
|
||||||
args.iterations,
|
|
||||||
args.warmup,
|
|
||||||
run,
|
|
||||||
)
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
return {
|
|
||||||
"name": "D_transformer_engine_fp8_linear",
|
|
||||||
"status": "error",
|
|
||||||
"reason": f"{type(exc).__name__}: {exc}",
|
|
||||||
}
|
|
||||||
result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("--matrix-size", type=int, default=8192)
|
|
||||||
parser.add_argument("--warmup", type=int, default=20)
|
|
||||||
parser.add_argument("--iterations", type=int, default=100)
|
|
||||||
parser.add_argument("--gpu-index", type=int, default=0)
|
|
||||||
parser.add_argument("--pools", type=int, default=4)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if not torch.cuda.is_available():
|
|
||||||
print(json.dumps({"error": "cuda unavailable"}, indent=2))
|
|
||||||
return 1
|
|
||||||
if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
|
|
||||||
print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
|
|
||||||
return 1
|
|
||||||
|
|
||||||
torch.cuda.set_device(args.gpu_index)
|
|
||||||
props = torch.cuda.get_device_properties(args.gpu_index)
|
|
||||||
payload = {
|
|
||||||
"source": "pytorch_fp8_path_bench",
|
|
||||||
"torch": torch.__version__,
|
|
||||||
"cuda": torch.version.cuda,
|
|
||||||
"gpu_index": args.gpu_index,
|
|
||||||
"gpu_name": props.name,
|
|
||||||
"matrix_size": args.matrix_size,
|
|
||||||
"warmup": args.warmup,
|
|
||||||
"iterations": args.iterations,
|
|
||||||
"results": [],
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
payload["results"].extend(bench_scaled_mm(args))
|
|
||||||
payload["results"].append(bench_transformer_engine(args))
|
|
||||||
except torch.cuda.OutOfMemoryError as exc:
|
|
||||||
payload["error"] = f"CUDA OOM: {exc}"
|
|
||||||
print(json.dumps(payload, indent=2))
|
|
||||||
return 1
|
|
||||||
|
|
||||||
ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
|
|
||||||
if ok_values:
|
|
||||||
payload["summary"] = {
|
|
||||||
"max_tflops": round(max(ok_values), 1),
|
|
||||||
"min_tflops": round(min(ok_values), 1),
|
|
||||||
"mean_tflops": round(statistics.mean(ok_values), 1),
|
|
||||||
}
|
|
||||||
print(json.dumps(payload, indent=2))
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@ -1,45 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -uo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
|
||||||
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
|
|
||||||
|
|
||||||
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
|
|
||||||
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
|
|
||||||
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
|
|
||||||
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
|
|
||||||
WARMUP="${WARMUP:-20}"
|
|
||||||
ITERATIONS="${ITERATIONS:-200}"
|
|
||||||
GPU_COUNT="${GPU_COUNT:-8}"
|
|
||||||
FIRST_GPU="${FIRST_GPU:-0}"
|
|
||||||
WORKSPACE_MB="${WORKSPACE_MB:-256}"
|
|
||||||
|
|
||||||
if [[ ! -x "$NVCC" ]]; then
|
|
||||||
echo "nvcc not found: $NVCC" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
|
|
||||||
HOST="$(hostname 2>/dev/null || echo unknown)"
|
|
||||||
TS="$(date +%Y%m%d_%H%M%S)"
|
|
||||||
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
|
|
||||||
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
|
|
||||||
|
|
||||||
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
|
|
||||||
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
|
|
||||||
-lcublasLt -lcublas -o "$BIN"
|
|
||||||
|
|
||||||
set +e
|
|
||||||
"$BIN" \
|
|
||||||
--matrix-size "$MATRIX_SIZE" \
|
|
||||||
--warmup "$WARMUP" \
|
|
||||||
--iterations "$ITERATIONS" \
|
|
||||||
--first-gpu "$FIRST_GPU" \
|
|
||||||
--gpu-count "$GPU_COUNT" \
|
|
||||||
--workspace-mb "$WORKSPACE_MB" \
|
|
||||||
| tee "$REPORT"
|
|
||||||
status=${PIPESTATUS[0]}
|
|
||||||
set -e
|
|
||||||
|
|
||||||
echo "Report written to: $REPORT"
|
|
||||||
exit "$status"
|
|
||||||
@ -1,93 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
|
||||||
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
|
|
||||||
|
|
||||||
PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
|
|
||||||
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
|
|
||||||
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
|
|
||||||
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
|
|
||||||
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
|
|
||||||
WARMUP="${WARMUP:-20}"
|
|
||||||
ITERATIONS="${ITERATIONS:-100}"
|
|
||||||
GPU_INDEX="${GPU_INDEX:-0}"
|
|
||||||
WORKSPACE_MB="${WORKSPACE_MB:-256}"
|
|
||||||
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
|
|
||||||
import site
|
|
||||||
print(site.getsitepackages()[0])
|
|
||||||
PY
|
|
||||||
)"
|
|
||||||
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"
|
|
||||||
|
|
||||||
mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
|
|
||||||
|
|
||||||
HOST="$(hostname 2>/dev/null || echo unknown)"
|
|
||||||
TS="$(date +%Y%m%d_%H%M%S)"
|
|
||||||
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
|
|
||||||
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
|
|
||||||
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
|
|
||||||
|
|
||||||
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
|
|
||||||
--matrix-size "$MATRIX_SIZE" \
|
|
||||||
--warmup "$WARMUP" \
|
|
||||||
--iterations "$ITERATIONS" \
|
|
||||||
--gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
|
|
||||||
|
|
||||||
"$NVCC" -O3 -std=c++17 -arch=sm_90 \
|
|
||||||
"$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
|
|
||||||
-lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
|
|
||||||
|
|
||||||
"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
|
|
||||||
--matrix-size "$MATRIX_SIZE" \
|
|
||||||
--warmup "$WARMUP" \
|
|
||||||
--iterations "$ITERATIONS" \
|
|
||||||
--first-gpu "$GPU_INDEX" \
|
|
||||||
--gpu-count 1 \
|
|
||||||
--workspace-mb "$WORKSPACE_MB" \
|
|
||||||
--fast-accum 1 | tee "$CUBLAS_REPORT"
|
|
||||||
|
|
||||||
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
|
|
||||||
import json
|
|
||||||
import pathlib
|
|
||||||
import sys
|
|
||||||
|
|
||||||
py_report = pathlib.Path(sys.argv[1])
|
|
||||||
cublas_report = pathlib.Path(sys.argv[2])
|
|
||||||
combined_report = pathlib.Path(sys.argv[3])
|
|
||||||
|
|
||||||
with py_report.open() as f:
|
|
||||||
py_payload = json.load(f)
|
|
||||||
with cublas_report.open() as f:
|
|
||||||
cublas_payload = json.load(f)
|
|
||||||
|
|
||||||
combined = {
|
|
||||||
"source": "fp8_path_comparison",
|
|
||||||
"host": cublas_payload.get("host"),
|
|
||||||
"matrix_size": py_payload.get("matrix_size"),
|
|
||||||
"gpu_index": py_payload.get("gpu_index"),
|
|
||||||
"pytorch": py_payload,
|
|
||||||
"cublaslt": cublas_payload,
|
|
||||||
"results": [],
|
|
||||||
}
|
|
||||||
combined["results"].extend(py_payload.get("results", []))
|
|
||||||
per_gpu = cublas_payload.get("per_gpu", [])
|
|
||||||
if per_gpu:
|
|
||||||
row = dict(per_gpu[0])
|
|
||||||
row.update({
|
|
||||||
"name": "E_direct_cublaslt_fast_accum",
|
|
||||||
"status": "ok",
|
|
||||||
"tflops": row.pop("fp8_tflops"),
|
|
||||||
"matrix_size": cublas_payload.get("matrix_size"),
|
|
||||||
"iterations": cublas_payload.get("iterations"),
|
|
||||||
"warmup": cublas_payload.get("warmup"),
|
|
||||||
"fast_accum": cublas_payload.get("fast_accum"),
|
|
||||||
"note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
|
|
||||||
})
|
|
||||||
combined["results"].append(row)
|
|
||||||
|
|
||||||
combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
|
|
||||||
print(f"Combined report written to: {combined_report}")
|
|
||||||
PY
|
|
||||||
|
|
||||||
echo "$COMBINED_REPORT"
|
|
||||||
Loading…
x
Reference in New Issue
Block a user