From 4dddab27b3ed4abec644f282c6c24fd01589f7ef Mon Sep 17 00:00:00 2001 From: cs Date: Tue, 26 May 2026 00:13:33 +0800 Subject: [PATCH] Add FP8 GEMM path comparison reports --- reports_cublaslt_fp8_crosscheck_20260524.md | 87 ++++++ ...gemm_aikubeworker0012_20260524_071148.json | 21 ++ ...gemm_aikubeworker0016_20260524_071200.json | 21 ++ reports_fp8_path_comparison_20260525.md | 169 ++++++++++ ...ined_aikubeworker0012_20260525_042347.json | 142 +++++++++ ...ined_aikubeworker0012_20260525_045408.json | 156 ++++++++++ ...ined_aikubeworker0016_20260525_042402.json | 142 +++++++++ ...ined_aikubeworker0016_20260525_050048.json | 156 ++++++++++ reports_gpu_Test_combined_20260524.md | 152 +++++++++ reports_gpu_Test_formal_20260524.md | 123 ++++++++ reports_gpu_Test_pdf.css | 102 ++++++ scripts/cublaslt_fp8_gemm_bench.cu | 291 ++++++++++++++++++ scripts/pytorch_fp8_path_bench.py | 277 +++++++++++++++++ scripts/run_cublaslt_fp8_gemm.sh | 45 +++ scripts/run_fp8_path_comparison.sh | 93 ++++++ 15 files changed, 1977 insertions(+) create mode 100644 reports_cublaslt_fp8_crosscheck_20260524.md create mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json create mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json create mode 100644 reports_fp8_path_comparison_20260525.md create mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json create mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json create mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json create mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json create mode 100644 reports_gpu_Test_combined_20260524.md create mode 100644 reports_gpu_Test_formal_20260524.md create mode 100644 reports_gpu_Test_pdf.css create mode 100644 scripts/cublaslt_fp8_gemm_bench.cu create mode 100755 scripts/pytorch_fp8_path_bench.py create mode 100755 scripts/run_cublaslt_fp8_gemm.sh create mode 100755 scripts/run_fp8_path_comparison.sh diff --git a/reports_cublaslt_fp8_crosscheck_20260524.md b/reports_cublaslt_fp8_crosscheck_20260524.md new file mode 100644 index 0000000..194a562 --- /dev/null +++ b/reports_cublaslt_fp8_crosscheck_20260524.md @@ -0,0 +1,87 @@ +# cuBLASLt FP8 GEMM Cross-Check Report + +Date: 2026-05-24 + +Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path. + +## Method + +Added a direct cuBLASLt FP8 GEMM micro-benchmark: + +- Source: `scripts/cublaslt_fp8_gemm_bench.cu` +- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh` +- Input dtype: `CUDA_R_8F_E4M3` +- Output dtype: `CUDA_R_16BF` +- Accumulate / compute type: `CUBLAS_COMPUTE_32F` +- Layout: cuBLASLt FP8-required TN format +- Matrix size: `8192` +- Warmup: `50` +- Iterations: `500` +- GPUs: single-node 8 GPUs, measured one GPU at a time + +NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints. + +## Results + +### aikubeworker0012 / nccl-gpu-1 + +Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json` + +| GPU | FP8 TFLOPS | +|---:|---:| +| 0 | 1615.6 | +| 1 | 1611.0 | +| 2 | 1599.0 | +| 3 | 1607.1 | +| 4 | 1614.0 | +| 5 | 1604.4 | +| 6 | 1608.4 | +| 7 | 1609.1 | + +Summary: + +- Mean: `1608.6 TFLOPS` +- Min / Max: `1599.0 / 1615.6 TFLOPS` +- Spread: `1.03%` +- FP8 absolute threshold: `>= 1400 TFLOPS` +- Verdict against FP8 absolute threshold: **PASS** +- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS** + +### aikubeworker0016 / nccl-gpu-2 + +Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json` + +| GPU | FP8 TFLOPS | +|---:|---:| +| 0 | 1602.3 | +| 1 | 1604.0 | +| 2 | 1616.9 | +| 3 | 1610.6 | +| 4 | 1620.5 | +| 5 | 1630.3 | +| 6 | 1605.1 | +| 7 | 1620.2 | + +Summary: + +- Mean: `1613.7 TFLOPS` +- Min / Max: `1602.3 / 1630.3 TFLOPS` +- Spread: `1.74%` +- FP8 absolute threshold: `>= 1400 TFLOPS` +- Verdict against FP8 absolute threshold: **PASS** +- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS** + +## Comparison With Existing PyTorch `_scaled_mm` Result + +| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta | +|---|---:|---:|---:| +| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 | +| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 | + +The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`. + +## Conclusion + +The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue. + +Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal. diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json new file mode 100644 index 0000000..b61e641 --- /dev/null +++ b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json @@ -0,0 +1,21 @@ +{ + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "per_gpu": [ + {"index": 0, "fp8_tflops": 1615.6}, + {"index": 1, "fp8_tflops": 1611.0}, + {"index": 2, "fp8_tflops": 1599.0}, + {"index": 3, "fp8_tflops": 1607.1}, + {"index": 4, "fp8_tflops": 1614.0}, + {"index": 5, "fp8_tflops": 1604.4}, + {"index": 6, "fp8_tflops": 1608.4}, + {"index": 7, "fp8_tflops": 1609.1} + ], + "mean_tflops": 1608.6, + "min_tflops": 1599.0, + "max_tflops": 1615.6, + "spread_pct": 1.03 +} diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json new file mode 100644 index 0000000..6808990 --- /dev/null +++ b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json @@ -0,0 +1,21 @@ +{ + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "per_gpu": [ + {"index": 0, "fp8_tflops": 1602.3}, + {"index": 1, "fp8_tflops": 1604.0}, + {"index": 2, "fp8_tflops": 1616.9}, + {"index": 3, "fp8_tflops": 1610.6}, + {"index": 4, "fp8_tflops": 1620.5}, + {"index": 5, "fp8_tflops": 1630.3}, + {"index": 6, "fp8_tflops": 1605.1}, + {"index": 7, "fp8_tflops": 1620.2} + ], + "mean_tflops": 1613.7, + "min_tflops": 1602.3, + "max_tflops": 1630.3, + "spread_pct": 1.74 +} diff --git a/reports_fp8_path_comparison_20260525.md b/reports_fp8_path_comparison_20260525.md new file mode 100644 index 0000000..c245b15 --- /dev/null +++ b/reports_fp8_path_comparison_20260525.md @@ -0,0 +1,169 @@ +# FP8 GEMM 路径对比测试报告 + +测试日期:2026-05-25 +测试节点:aikubeworker0012、aikubeworker0016 +测试 GPU:NVIDIA H100 80GB HBM3 +测试目标:对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。 + +## 一、测试结论 + +本次 A-E 五条路径均已完成实测。 + +核心结论: + +1. direct cuBLASLt 是本组测试里最快路径,两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。 +2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。 +3. 打开 `use_fast_accum=True` 后,PyTorch eager 路径有稳定提升,约提升 5.0%-6.7%。 +4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS,但仍低于 direct cuBLASLt。 +5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径,不是裸 GEMM,因此包含 TE module、cast、FP8 recipe 等额外开销,结果低于 direct cuBLASLt,也低于 CUDA Graph `_scaled_mm`。 + +这说明:当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题;之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果,主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能,而不是 GPU 算力极限。 + +## 二、测试方法 + +统一参数: + +| 参数 | 值 | +|---|---:| +| matrix_size | 8192 | +| M/N/K | 8192/8192/8192 | +| warmup | 50 | +| iterations | 500 | +| GPU index | 0 | +| PyTorch | 2.6.0+cu124 | +| CUDA | 12.4 | +| 输入 dtype | FP8 E4M3 | +| 输出 dtype | BF16 | +| accumulation | FP32 | +| scale_a / scale_b | 1.0 / 1.0 | + +测试路径定义: + +| 路径 | 名称 | 含义 | +|---|---|---| +| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`,默认 accumulation 参数 | +| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径,但显式打开 fast accumulation | +| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用,降低 Python/PyTorch launch 间隙 | +| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行,包含 TE 层封装和 FP8 recipe 开销 | +| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`,绕过 PyTorch eager | + +复现脚本: + +```bash +MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \ + /root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh +``` + +## 三、实测结果 + +### aikubeworker0012 + +原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` + +| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 | +|---|---|---:|---:| +| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us | +| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us | +| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us | +| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us | +| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 | + +相对 A 的提升: + +| 路径 | 相对 A | +|---|---:| +| B | +6.7% | +| C | +11.5% | +| D | -2.8% | +| E | +37.1% | + +E 路径 cuBLASLt 算法信息: + +| 字段 | 值 | +|---|---:| +| algo_id | 52 | +| tile_id | 23 | +| splitk | 1 | +| stages_id | 36 | +| inner_shape_id | 0 | +| cluster_shape_id | 3 | + +### aikubeworker0016 + +原始 JSON:`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` + +| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 | +|---|---|---:|---:| +| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us | +| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us | +| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us | +| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us | +| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 | + +相对 A 的提升: + +| 路径 | 相对 A | +|---|---:| +| B | +5.0% | +| C | +10.0% | +| D | -3.2% | +| E | +37.5% | + +E 路径 cuBLASLt 算法信息: + +| 字段 | 值 | +|---|---:| +| algo_id | 52 | +| tile_id | 23 | +| splitk | 1 | +| stages_id | 36 | +| inner_shape_id | 0 | +| cluster_shape_id | 3 | + +## 四、对 PyTorch FP8 能否“上去”的判断 + +从本次结果看,PyTorch FP8 路径可以通过两类方式上去: + +1. 打开更快的 math/accumulation 参数,例如 `use_fast_accum=True`。 +2. 使用 CUDA Graph replay,减少 eager 模式下每轮调度、enqueue 之间的间隙。 + +但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下,PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说,direct cuBLASLt 证明硬件和底层库有能力跑得更高;PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。 + +如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”,后续可以继续验证: + +1. 更大的 GEMM size,例如 16384。 +2. 固定 shape 后用 `torch.compile` 或 Inductor。 +3. CUDA Graph 覆盖更完整的 step,而不是只 replay 单个 op。 +4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark,而不是 `te.Linear` module forward。 +5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取,确认实际 kernel、间隙和 cuBLASLt 算法选择。 + +## 五、术语说明 + +`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`,PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic,然后把 matmul enqueue 到 CUDA stream。 + +`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口,支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。 + +`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`,不经过 PyTorch eager,因此更接近裸 GEMM 峰值。 + +`CUDA Graph` 指把一次 CUDA work 提前捕获成图,后续直接 replay,减少 CPU 侧反复 launch/调度带来的间隙。 + +`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward,不等同于裸 GEMM microbenchmark。 + +## 六、文件清单 + +本地脚本: + +| 文件 | 用途 | +|---|---| +| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 | +| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 | +| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 | + +本地结果: + +| 文件 | 用途 | +|---|---| +| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 | +| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 | +| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 | + diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json new file mode 100644 index 0000000..51a1540 --- /dev/null +++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json @@ -0,0 +1,142 @@ +{ + "source": "fp8_path_comparison", + "host": null, + "matrix_size": 8192, + "gpu_index": 0, + "pytorch": { + "source": "pytorch_fp8_path_bench", + "torch": "2.6.0+cu124", + "cuda": "12.4", + "gpu_index": 0, + "gpu_name": "NVIDIA H100 80GB HBM3", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 465.145, + "event_us_per_iter": 930.29, + "wall_ms_total": 465.21, + "tflops": 1181.9 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 440.252, + "event_us_per_iter": 880.504, + "wall_ms_total": 440.289, + "tflops": 1248.7 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 415.631, + "event_us_per_iter": 831.262, + "wall_ms_total": 415.664, + "tflops": 1322.7 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "unavailable", + "reason": "ModuleNotFoundError: No module named 'transformer_engine'" + } + ], + "summary": { + "max_tflops": 1322.7, + "min_tflops": 1181.9, + "mean_tflops": 1251.1 + } + }, + "cublaslt": { + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "fast_accum": 1, + "per_gpu": [ + { + "index": 0, + "fp8_tflops": 1615.4, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3 + } + ], + "mean_tflops": 1615.4, + "min_tflops": 1615.4, + "max_tflops": 1615.4, + "spread_pct": 0.0 + }, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 465.145, + "event_us_per_iter": 930.29, + "wall_ms_total": 465.21, + "tflops": 1181.9 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 440.252, + "event_us_per_iter": 880.504, + "wall_ms_total": 440.289, + "tflops": 1248.7 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 415.631, + "event_us_per_iter": 831.262, + "wall_ms_total": 415.664, + "tflops": 1322.7 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "unavailable", + "reason": "ModuleNotFoundError: No module named 'transformer_engine'" + }, + { + "index": 0, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3, + "name": "E_direct_cublaslt_fast_accum", + "status": "ok", + "tflops": 1615.4, + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "fast_accum": 1, + "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager." + } + ] +} \ No newline at end of file diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json new file mode 100644 index 0000000..56cbce5 --- /dev/null +++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json @@ -0,0 +1,156 @@ +{ + "source": "fp8_path_comparison", + "host": null, + "matrix_size": 8192, + "gpu_index": 0, + "pytorch": { + "source": "pytorch_fp8_path_bench", + "torch": "2.6.0+cu124", + "cuda": "12.4", + "gpu_index": 0, + "gpu_name": "NVIDIA H100 80GB HBM3", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 463.507, + "event_us_per_iter": 927.014, + "wall_ms_total": 463.573, + "tflops": 1186.1 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 434.241, + "event_us_per_iter": 868.481, + "wall_ms_total": 434.492, + "tflops": 1266.0 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 415.786, + "event_us_per_iter": 831.573, + "wall_ms_total": 415.825, + "tflops": 1322.2 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 476.739, + "event_us_per_iter": 953.478, + "wall_ms_total": 476.8, + "tflops": 1153.2, + "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead." + } + ], + "summary": { + "max_tflops": 1322.2, + "min_tflops": 1153.2, + "mean_tflops": 1231.9 + } + }, + "cublaslt": { + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "fast_accum": 1, + "per_gpu": [ + { + "index": 0, + "fp8_tflops": 1626.6, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3 + } + ], + "mean_tflops": 1626.6, + "min_tflops": 1626.6, + "max_tflops": 1626.6, + "spread_pct": 0.0 + }, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 463.507, + "event_us_per_iter": 927.014, + "wall_ms_total": 463.573, + "tflops": 1186.1 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 434.241, + "event_us_per_iter": 868.481, + "wall_ms_total": 434.492, + "tflops": 1266.0 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 415.786, + "event_us_per_iter": 831.573, + "wall_ms_total": 415.825, + "tflops": 1322.2 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 476.739, + "event_us_per_iter": 953.478, + "wall_ms_total": 476.8, + "tflops": 1153.2, + "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead." + }, + { + "index": 0, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3, + "name": "E_direct_cublaslt_fast_accum", + "status": "ok", + "tflops": 1626.6, + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "fast_accum": 1, + "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager." + } + ] +} \ No newline at end of file diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json new file mode 100644 index 0000000..6d6a3a2 --- /dev/null +++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json @@ -0,0 +1,142 @@ +{ + "source": "fp8_path_comparison", + "host": null, + "matrix_size": 8192, + "gpu_index": 0, + "pytorch": { + "source": "pytorch_fp8_path_bench", + "torch": "2.6.0+cu124", + "cuda": "12.4", + "gpu_index": 0, + "gpu_name": "NVIDIA H100 80GB HBM3", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 470.909, + "event_us_per_iter": 941.817, + "wall_ms_total": 470.974, + "tflops": 1167.4 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 452.608, + "event_us_per_iter": 905.215, + "wall_ms_total": 452.647, + "tflops": 1214.6 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 427.724, + "event_us_per_iter": 855.449, + "wall_ms_total": 427.768, + "tflops": 1285.3 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "unavailable", + "reason": "ModuleNotFoundError: No module named 'transformer_engine'" + } + ], + "summary": { + "max_tflops": 1285.3, + "min_tflops": 1167.4, + "mean_tflops": 1222.4 + } + }, + "cublaslt": { + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "fast_accum": 1, + "per_gpu": [ + { + "index": 0, + "fp8_tflops": 1594.3, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3 + } + ], + "mean_tflops": 1594.3, + "min_tflops": 1594.3, + "max_tflops": 1594.3, + "spread_pct": 0.0 + }, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 470.909, + "event_us_per_iter": 941.817, + "wall_ms_total": 470.974, + "tflops": 1167.4 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 452.608, + "event_us_per_iter": 905.215, + "wall_ms_total": 452.647, + "tflops": 1214.6 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 427.724, + "event_us_per_iter": 855.449, + "wall_ms_total": 427.768, + "tflops": 1285.3 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "unavailable", + "reason": "ModuleNotFoundError: No module named 'transformer_engine'" + }, + { + "index": 0, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3, + "name": "E_direct_cublaslt_fast_accum", + "status": "ok", + "tflops": 1594.3, + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "fast_accum": 1, + "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager." + } + ] +} \ No newline at end of file diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json new file mode 100644 index 0000000..7168c05 --- /dev/null +++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json @@ -0,0 +1,156 @@ +{ + "source": "fp8_path_comparison", + "host": null, + "matrix_size": 8192, + "gpu_index": 0, + "pytorch": { + "source": "pytorch_fp8_path_bench", + "torch": "2.6.0+cu124", + "cuda": "12.4", + "gpu_index": 0, + "gpu_name": "NVIDIA H100 80GB HBM3", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 473.156, + "event_us_per_iter": 946.313, + "wall_ms_total": 473.199, + "tflops": 1161.9 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 450.48, + "event_us_per_iter": 900.96, + "wall_ms_total": 450.505, + "tflops": 1220.4 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 430.272, + "event_us_per_iter": 860.543, + "wall_ms_total": 430.304, + "tflops": 1277.7 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 488.527, + "event_us_per_iter": 977.054, + "wall_ms_total": 488.576, + "tflops": 1125.3, + "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead." + } + ], + "summary": { + "max_tflops": 1277.7, + "min_tflops": 1125.3, + "mean_tflops": 1196.3 + } + }, + "cublaslt": { + "source": "cuBLASLt", + "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum", + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "fast_accum": 1, + "per_gpu": [ + { + "index": 0, + "fp8_tflops": 1598.1, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3 + } + ], + "mean_tflops": 1598.1, + "min_tflops": 1598.1, + "max_tflops": 1598.1, + "spread_pct": 0.0 + }, + "results": [ + { + "name": "A_eager_scaled_mm_default", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 473.156, + "event_us_per_iter": 946.313, + "wall_ms_total": 473.199, + "tflops": 1161.9 + }, + { + "name": "B_eager_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 450.48, + "event_us_per_iter": 900.96, + "wall_ms_total": 450.505, + "tflops": 1220.4 + }, + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 3, + "event_ms_total": 430.272, + "event_us_per_iter": 860.543, + "wall_ms_total": 430.304, + "tflops": 1277.7 + }, + { + "name": "D_transformer_engine_fp8_linear", + "status": "ok", + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "event_ms_total": 488.527, + "event_us_per_iter": 977.054, + "wall_ms_total": 488.576, + "tflops": 1125.3, + "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead." + }, + { + "index": 0, + "algo_id": 52, + "tile_id": 23, + "splitk": 1, + "stages_id": 36, + "inner_shape_id": 0, + "cluster_shape_id": 3, + "name": "E_direct_cublaslt_fast_accum", + "status": "ok", + "tflops": 1598.1, + "matrix_size": 8192, + "iterations": 500, + "warmup": 50, + "fast_accum": 1, + "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager." + } + ] +} \ No newline at end of file diff --git a/reports_gpu_Test_combined_20260524.md b/reports_gpu_Test_combined_20260524.md new file mode 100644 index 0000000..b4fff0a --- /dev/null +++ b/reports_gpu_Test_combined_20260524.md @@ -0,0 +1,152 @@ +# GPU_Test 合并报告 + +- **日期:** 2026-05-24 +- **节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16` +- **GPU:** NVIDIA H100 80GB HBM3 x8 / node +- **范围:** 单机单卡算力与多机多卡 NCCL 通信 +- **说明:** 本报告汇总既有原始测试结果,不重新启动额外压力测试。 + +## 总体结论 + +| 测试项 | 结论 | 说明 | +|---|---|---| +| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 | +| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` | +| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`,低于阈值;已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低,不作为硬件失败依据 | +| 多机多卡 NCCL 正确性 | PASS | return code `0`,`Wrong=0` / `Out of bounds values: 0 OK` | +| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值,但该阈值不应直接硬套到当前 4x400Gbps 环境 | + +## 单机单卡 / 算力测试 + +### 机器信息 + +| Host | GPU | Driver | CUDA | GPU 数量 | +|---|---|---|---|---:| +| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 | +| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 | + +来源: + +- `reports_single_gpu_aikubeworker0012.md` +- `reports_single_gpu_aikubeworker0016.md` + +### 原始 PyTorch 单机算力结果 + +| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict | +|---|---:|---:|---:|---:|---:|---| +| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL | +| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL | + +原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示,该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响,不能直接代表 H100 FP8 Tensor Core 硬件上限。 + +### direct cuBLASLt FP8 GEMM 交叉验证 + +测试参数: + +| 参数 | 值 | +|---|---| +| Benchmark | direct cuBLASLt FP8 GEMM | +| Source | `scripts/cublaslt_fp8_gemm_bench.cu` | +| Matrix | `8192 x 8192 x 8192` | +| A/B dtype | FP8 E4M3 | +| Output dtype | BF16 | +| Compute type | `CUBLAS_COMPUTE_32F` | +| Scale type | `CUDA_R_32F` | +| Scale A/B | `1.0` | +| Layout | TN | +| fast accumulation | enabled | +| Threshold | `>= 1400 TFLOPS` | + +结果: + +| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict | +|---|---:|---:|---:|---:|---:|---| +| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS | +| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS | + +单卡逐张结果: + +| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 | +|---|---:|---:|---:|---:|---:|---:|---:|---:| +| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 | +| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 | + +结论:direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值,说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录,而不是 GPU 硬件失败结论。 + +来源: + +- `reports_cublaslt_fp8_crosscheck_20260524.md` +- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json` +- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json` + +## 多机多卡 NCCL 测试 + +### 测试环境 + +| 项目 | 结果 | +|---|---| +| Hosts | `nccl-gpu-1(172.72.8.12)`,`nccl-gpu-2(172.72.8.16)` | +| Topology | 2 nodes x 8 GPUs,合计 16 GPUs | +| NCCL source | `nccl-tests-mpirun` | +| NCCL network | IB | +| GPU Direct RDMA | ENABLED | +| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` | +| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE | + +注意:NCCL 表里的 `GB/s` 是大 B,即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b,即 bits/s。 + +### 2x8 全集合通信结果 + +| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 | +|---|---:|---:|---:|---|---| +| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 | +| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态,低于 PDF 8 rail 阈值 | +| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 | +| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 | +| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 | +| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 | + +结论:2x8 全集合通信测试中,NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值,但当前机器确认参与 NCCL 的是 4 条 400Gbps rail,因此该差距不应直接判定为当前 4x400Gbps 环境不合格。 + +来源: + +- `reports_multinode_nccl_all_collectives_20260523_120144.md` +- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` + +### PDF Matrix allreduce / alltoall 结果 + +AllReduce(PDF 8x400Gbps 阈值对比,仅作参考): + +| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 | +|---|---:|---:|---:|---:|---| +| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 | +| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 | +| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 | +| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 | + +AllToAll(PDF 8x400Gbps 阈值对比,仅作参考): + +| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 | +|---|---:|---:|---:|---:|---| +| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 | +| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 | +| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 | +| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值;当前为 4 rail 环境,不直接判不合格 | + +来源: + +- `reports_multinode_nccl_pdf_matrix_run_20260523.md` +- `reports_multinode_nccl_pdf_matrix_20260523_113803.md` + +## 风险与判断 + +1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证,当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。 +2. 多机 NCCL 正确性通过,性能结果应按当前 4x400Gbps rail 环境解释。 +3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail;PDF 参考环境为 8x400G 计算管理网络,因此 2x8 阈值与当前硬件形态不等价。 +4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值,建议作为“与 PDF 参考环境差异”记录,而不是作为当前 4 rail 环境不合格结论。 + +## 建议 + +1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主,PyTorch `_scaled_mm` 作为软件栈参考项保留。 +2. 多机 NCCL 后续若要按 PDF 阈值验收,需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。 +3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`:NCCL bus bandwidth 是大 B,IB 端口速率是小 b。 diff --git a/reports_gpu_Test_formal_20260524.md b/reports_gpu_Test_formal_20260524.md new file mode 100644 index 0000000..65969b2 --- /dev/null +++ b/reports_gpu_Test_formal_20260524.md @@ -0,0 +1,123 @@ +# GPU_Test 双节点测试报告 + +- **测试日期:** 2026-05-24 +- **测试节点:** `aikubeworker0012 / 172.72.8.12`,`aikubeworker0016 / 172.72.8.16` +- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU +- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信 +- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail + +## 结论摘要 + +| 项目 | 结果摘要 | +|---|---| +| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU | +| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS | +| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成,主要大包通信带宽稳定 | +| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过,所有测试 `Wrong=0` / return code `0` | +| 多机网络口径 | 当前为 4x400Gbps IB rail 环境,结果按该硬件形态解释 | + +## 测试环境 + +| Host | GPU | Driver | CUDA | GPU 数量 | +|---|---|---|---|---:| +| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 | +| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 | + +## 单机算力测试 + +### FP8 GEMM 硬件路径验证 + +本项使用 direct cuBLASLt FP8 GEMM benchmark,绕过 PyTorch eager 调度路径,直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。 + +| 参数 | 配置 | +|---|---| +| GEMM shape | `8192 x 8192 x 8192` | +| 输入类型 | FP8 E4M3 | +| 输出类型 | BF16 | +| 累加类型 | FP32 compute | +| Layout | TN | +| Scale | `scale_a = 1.0`,`scale_b = 1.0` | +| fast accumulation | enabled | +| 测试 GPU | 每节点 8 张 GPU 逐张测试 | + +| Host | Mean FP8 TFLOPS | Min | Max | Spread | +|---|---:|---:|---:|---:| +| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | +| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | + +| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 | +|---|---:|---:|---:|---:|---:|---:|---:|---:| +| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 | +| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 | + +**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS,该结果反映 PyTorch 软件路径与调度开销,不作为本报告的硬件算力结论。 + +## 单机 8 卡 NCCL 通信测试 + +本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试,结果单位为 `GB/s`,即 Bytes/s。 + +| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW | +|---|---:|---:| +| allreduce | 472.3 GB/s | 472.4 GB/s | +| alltoall | 343.3 GB/s | 344.3 GB/s | +| broadcast | 364.1 GB/s | 363.6 GB/s | +| reducescatter | 352.8 GB/s | 353.1 GB/s | +| allgather | 366.4 GB/s | 366.4 GB/s | +| sendrecv | 369.0 GB/s | 368.9 GB/s | + +**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近,节点间差异较小。 + +## 多机 2x8 NCCL 通信测试 + +本项使用两台节点,每台 8 张 GPU,共 16 张 GPU 进行跨节点 NCCL 集合通信测试。 + +### 网络环境 + +| 项目 | 配置 | +|---|---| +| Host A | `aikubeworker0012 / 172.72.8.12` | +| Host B | `aikubeworker0016 / 172.72.8.16` | +| 拓扑 | 2 nodes x 8 GPUs | +| NCCL network | IB | +| GPU Direct RDMA | ENABLED | +| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` | +| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE | + +### 跨节点 NCCL 结果 + +| Operation | Peak Bus BW | Avg Bus BW | Correctness | +|---|---:|---:|---| +| allreduce | 354.27 GB/s | 354.45 GB/s | PASS | +| alltoall | 37.00 GB/s | 37.14 GB/s | PASS | +| broadcast | 191.65 GB/s | 190.25 GB/s | PASS | +| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS | +| allgather | 192.14 GB/s | 192.47 GB/s | PASS | +| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS | + +**正确性:** 本轮多机 NCCL 测试 return code 为 `0`,`Wrong=0`,未发现数据正确性错误。 + +## 单位说明 + +| 写法 | 含义 | 说明 | +|---|---|---| +| `GB/s` | Gigabytes per second | 大 B,字节每秒,NCCL bus bandwidth 使用此单位 | +| `Gbps` / `Gb/s` | Gigabits per second | 小 b,比特每秒,IB 端口速率通常使用此单位 | + +换算关系: + +```text +1 Byte = 8 bits +400 Gb/s = 50 GB/s +4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽 +``` + +NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽,不等同于单条物理链路的线速。 + +## 结果说明 + +1. 两台节点 GPU 识别正常,均为 8 张 H100 80GB HBM3。 +2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPS,GPU FP8 硬件计算路径正常。 +3. 单机 8 卡 NCCL 通信在两台节点上结果接近,未观察到明显节点间异常差异。 +4. 多机 2x8 NCCL 正确性通过,跨节点通信功能正常。 +5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释;若后续需要对齐 8x400Gbps 环境,应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。 + diff --git a/reports_gpu_Test_pdf.css b/reports_gpu_Test_pdf.css new file mode 100644 index 0000000..8ef6d39 --- /dev/null +++ b/reports_gpu_Test_pdf.css @@ -0,0 +1,102 @@ +@page { + size: A4 landscape; + margin: 13mm; +} + +body { + color: #111827; + font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif; + font-size: 11px; + line-height: 1.45; +} + +h1 { + color: #0f172a; + font-size: 24px; + margin: 0 0 14px; +} + +h2 { + border-bottom: 1px solid #cbd5e1; + color: #0f172a; + font-size: 17px; + margin: 24px 0 10px; + padding-bottom: 4px; +} + +h3 { + color: #1f2937; + font-size: 13px; + margin: 16px 0 8px; +} + +p { + margin: 7px 0; +} + +code { + background: #f1f5f9; + border-radius: 3px; + color: #0f172a; + font-family: Menlo, Consolas, monospace; + font-size: 10px; + padding: 1px 3px; +} + +pre { + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 4px; + padding: 8px; + white-space: pre-wrap; +} + +table { + border-collapse: collapse; + margin: 8px 0 14px; + page-break-inside: auto; + width: 100%; +} + +thead { + display: table-header-group; +} + +tr { + page-break-inside: avoid; +} + +th, +td { + border: 1px solid #cbd5e1; + padding: 5px 6px; + text-align: left; + vertical-align: middle; + word-break: break-word; +} + +th { + background: #e2e8f0; + color: #0f172a; + font-weight: 700; +} + +tbody tr:nth-child(even) td { + background: #f8fafc; +} + +a { + color: #2563eb; + text-decoration: none; +} + +ul, +ol { + margin: 6px 0 10px 20px; + padding: 0; +} + +li { + margin: 3px 0; +} + diff --git a/scripts/cublaslt_fp8_gemm_bench.cu b/scripts/cublaslt_fp8_gemm_bench.cu new file mode 100644 index 0000000..a401f36 --- /dev/null +++ b/scripts/cublaslt_fp8_gemm_bench.cu @@ -0,0 +1,291 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status = (call); \ + if (status != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(status)); \ + std::exit(1); \ + } \ + } while (0) + +#define CHECK_CUBLAS(call) \ + do { \ + cublasStatus_t status = (call); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \ + __LINE__, static_cast(status)); \ + std::exit(1); \ + } \ + } while (0) + +__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = tid; i < count; i += stride) { + ptr[i] = __nv_fp8_e4m3(value); + } +} + +struct Args { + int matrix_size = 8192; + int warmup = 20; + int iterations = 200; + int first_gpu = 0; + int gpu_count = -1; + size_t workspace_mb = 256; + int fast_accum = 1; +}; + +static Args parse_args(int argc, char **argv) { + Args args; + for (int i = 1; i < argc; ++i) { + auto need = [&](const char *name) { + if (i + 1 >= argc) { + std::fprintf(stderr, "Missing value for %s\n", name); + std::exit(2); + } + return argv[++i]; + }; + if (!std::strcmp(argv[i], "--matrix-size")) { + args.matrix_size = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--warmup")) { + args.warmup = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--iterations")) { + args.iterations = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--first-gpu")) { + args.first_gpu = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--gpu-count")) { + args.gpu_count = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--workspace-mb")) { + args.workspace_mb = static_cast(std::atoll(need(argv[i]))); + } else if (!std::strcmp(argv[i], "--fast-accum")) { + args.fast_accum = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) { + std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] " + "[--iterations N] [--first-gpu N] [--gpu-count N] " + "[--workspace-mb N] [--fast-accum 0|1]"); + std::exit(0); + } else { + std::fprintf(stderr, "Unknown argument: %s\n", argv[i]); + std::exit(2); + } + } + return args; +} + +static double run_one_gpu(int gpu, const Args &args) { + CHECK_CUDA(cudaSetDevice(gpu)); + + const int64_t m = args.matrix_size; + const int64_t n = args.matrix_size; + const int64_t k = args.matrix_size; + const size_t a_elems = static_cast(m) * k; + const size_t b_elems = static_cast(k) * n; + const size_t d_elems = static_cast(m) * n; + + __nv_fp8_e4m3 *d_a = nullptr; + __nv_fp8_e4m3 *d_b = nullptr; + __nv_bfloat16 *d_d = nullptr; + void *workspace = nullptr; + float *d_scale_a = nullptr; + float *d_scale_b = nullptr; + const float scale = 1.0f; + const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL; + + CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3))); + CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3))); + CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16))); + CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes)); + CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float))); + CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float))); + CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice)); + + const int threads = 256; + const int blocks = 4096; + fill_fp8<<>>(d_a, a_elems, 0.01f); + fill_fp8<<>>(d_b, b_elems, 0.01f); + CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16))); + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + + cublasLtHandle_t lt; + cublasLtMatmulDesc_t op_desc; + cublasLtMatrixLayout_t a_desc, b_desc, d_desc; + cublasLtMatmulPreference_t preference; + CHECK_CUBLAS(cublasLtCreate(<)); + CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + // cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed. + // With square GEMMs this keeps the benchmark FLOP count identical to the PDF + // acceptance shape while satisfying the library's FP8 kernel constraints. + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a, + sizeof(d_scale_a))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b, + sizeof(d_scale_b))); + int8_t fast_accum = args.fast_accum ? 1 : 0; + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum, + sizeof(fast_accum))); + + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k)); + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k)); + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m)); + + CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); + CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes, + sizeof(workspace_bytes))); + + cublasLtMatmulHeuristicResult_t heuristic; + int returned = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic, + &returned)); + if (returned == 0) { + std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu); + std::exit(1); + } + + auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + int32_t value = -1; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + uint32_t value = 0; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + uint16_t value = 0; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID); + const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID); + const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM); + const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID); + const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID); + const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID); + + const float alpha = 1.0f; + const float beta = 0.0f; + auto matmul = [&]() { + CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc, + &beta, d_d, d_desc, d_d, d_desc, + &heuristic.algo, workspace, workspace_bytes, 0)); + }; + + for (int i = 0; i < args.warmup; ++i) { + matmul(); + } + CHECK_CUDA(cudaDeviceSynchronize()); + + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + CHECK_CUDA(cudaEventRecord(start)); + for (int i = 0; i < args.iterations; ++i) { + matmul(); + } + CHECK_CUDA(cudaEventRecord(stop)); + CHECK_CUDA(cudaEventSynchronize(stop)); + float elapsed_ms = 0.0f; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop)); + const double flops = + 2.0 * static_cast(m) * static_cast(n) * + static_cast(k) * static_cast(args.iterations); + const double tflops = flops / (static_cast(elapsed_ms) / 1000.0) / 1e12; + std::printf( + " {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, " + "\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, " + "\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n", + gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape, + (gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ","); + std::fflush(stdout); + + CHECK_CUDA(cudaEventDestroy(start)); + CHECK_CUDA(cudaEventDestroy(stop)); + CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc)); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc)); + CHECK_CUBLAS(cublasLtDestroy(lt)); + CHECK_CUDA(cudaFree(d_a)); + CHECK_CUDA(cudaFree(d_b)); + CHECK_CUDA(cudaFree(d_d)); + CHECK_CUDA(cudaFree(workspace)); + CHECK_CUDA(cudaFree(d_scale_a)); + CHECK_CUDA(cudaFree(d_scale_b)); + CHECK_CUDA(cudaDeviceSynchronize()); + + return tflops; +} + +int main(int argc, char **argv) { + Args args = parse_args(argc, argv); + int device_count = 0; + CHECK_CUDA(cudaGetDeviceCount(&device_count)); + if (args.gpu_count < 0) { + args.gpu_count = device_count - args.first_gpu; + } + if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) { + std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n", + args.first_gpu, args.gpu_count, device_count); + return 2; + } + + std::vector values; + std::printf("{\n"); + std::printf(" \"source\": \"cuBLASLt\",\n"); + std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n"); + std::printf(" \"matrix_size\": %d,\n", args.matrix_size); + std::printf(" \"warmup\": %d,\n", args.warmup); + std::printf(" \"iterations\": %d,\n", args.iterations); + std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0); + std::printf(" \"per_gpu\": [\n"); + for (int i = 0; i < args.gpu_count; ++i) { + int gpu = args.first_gpu + i; + double tflops = run_one_gpu(gpu, args); + values.push_back(tflops); + } + double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size(); + auto minmax = std::minmax_element(values.begin(), values.end()); + double spread = ((*minmax.second - *minmax.first) / mean) * 100.0; + std::printf(" ],\n"); + std::printf(" \"mean_tflops\": %.1f,\n", mean); + std::printf(" \"min_tflops\": %.1f,\n", *minmax.first); + std::printf(" \"max_tflops\": %.1f,\n", *minmax.second); + std::printf(" \"spread_pct\": %.2f\n", spread); + std::printf("}\n"); + return mean >= 1400.0 ? 0 : 1; +} diff --git a/scripts/pytorch_fp8_path_bench.py b/scripts/pytorch_fp8_path_bench.py new file mode 100755 index 0000000..ab35af8 --- /dev/null +++ b/scripts/pytorch_fp8_path_bench.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 +"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging. + +Paths: + A. torch._scaled_mm eager, default accumulation + B. torch._scaled_mm eager, use_fast_accum=True + C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True) + D. Transformer Engine Linear under fp8_autocast, when installed +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import sys +import time +from typing import Any, Callable + +import torch + + +def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float: + flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations + return flops / (elapsed_ms / 1000.0) / 1e12 + + +def cuda_event_bench( + name: str, + matrix_size: int, + iterations: int, + warmup: int, + func: Callable[[int], Any], +) -> dict[str, Any]: + for i in range(warmup): + func(i) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + wall_start = time.perf_counter() + start.record() + for i in range(iterations): + func(i) + end.record() + torch.cuda.synchronize() + wall_elapsed = time.perf_counter() - wall_start + elapsed_ms = start.elapsed_time(end) + return { + "name": name, + "status": "ok", + "matrix_size": matrix_size, + "iterations": iterations, + "warmup": warmup, + "event_ms_total": round(elapsed_ms, 3), + "event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3), + "wall_ms_total": round(wall_elapsed * 1000.0, 3), + "tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1), + } + + +def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + a = [ + torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) + for _ in range(pools) + ] + b = [ + torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn) + for _ in range(pools) + ] + torch.cuda.synchronize() + return a, b + + +def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]: + device = f"cuda:{args.gpu_index}" + torch.cuda.set_device(args.gpu_index) + scale_a = torch.tensor(1.0, device=device) + scale_b = torch.tensor(1.0, device=device) + pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device) + results: list[dict[str, Any]] = [] + + def eager_default(i: int) -> torch.Tensor: + idx = i % args.pools + return torch._scaled_mm( + pools_a[idx], + pools_b[idx].T, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + ) + + def eager_fast(i: int) -> torch.Tensor: + idx = i % args.pools + return torch._scaled_mm( + pools_a[idx], + pools_b[idx].T, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + + results.append( + cuda_event_bench( + "A_eager_scaled_mm_default", + args.matrix_size, + args.iterations, + args.warmup, + eager_default, + ) + ) + results.append( + cuda_event_bench( + "B_eager_scaled_mm_fast_accum", + args.matrix_size, + args.iterations, + args.warmup, + eager_fast, + ) + ) + + graph_out = torch.empty( + (args.matrix_size, args.matrix_size), + device=device, + dtype=torch.bfloat16, + ) + static_a = pools_a[0] + static_b_t = pools_b[0].T + + try: + side_stream = torch.cuda.Stream() + side_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(side_stream): + for _ in range(max(3, args.warmup // 2)): + torch._scaled_mm( + static_a, + static_b_t, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + out=graph_out, + ) + torch.cuda.current_stream().wait_stream(side_stream) + torch.cuda.synchronize() + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + torch._scaled_mm( + static_a, + static_b_t, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + out=graph_out, + ) + + def graph_replay(_: int) -> None: + graph.replay() + + results.append( + cuda_event_bench( + "C_cuda_graph_scaled_mm_fast_accum", + args.matrix_size, + args.iterations, + 3, + graph_replay, + ) + ) + except Exception as exc: # noqa: BLE001 + results.append( + { + "name": "C_cuda_graph_scaled_mm_fast_accum", + "status": "unavailable", + "reason": f"{type(exc).__name__}: {exc}", + } + ) + + return results + + +def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]: + try: + import transformer_engine.pytorch as te # type: ignore[import-not-found] + from transformer_engine.common.recipe import DelayedScaling, Format # type: ignore[import-not-found] + except Exception as exc: # noqa: BLE001 + return { + "name": "D_transformer_engine_fp8_linear", + "status": "unavailable", + "reason": f"{type(exc).__name__}: {exc}", + } + + device = f"cuda:{args.gpu_index}" + x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16) + layer = te.Linear( + args.matrix_size, + args.matrix_size, + bias=False, + params_dtype=torch.bfloat16, + device=device, + ) + recipe = DelayedScaling(fp8_format=Format.HYBRID) + + def run(_: int) -> torch.Tensor: + with te.fp8_autocast(enabled=True, fp8_recipe=recipe): + return layer(x) + + try: + result = cuda_event_bench( + "D_transformer_engine_fp8_linear", + args.matrix_size, + args.iterations, + args.warmup, + run, + ) + except Exception as exc: # noqa: BLE001 + return { + "name": "D_transformer_engine_fp8_linear", + "status": "error", + "reason": f"{type(exc).__name__}: {exc}", + } + result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead." + return result + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--matrix-size", type=int, default=8192) + parser.add_argument("--warmup", type=int, default=20) + parser.add_argument("--iterations", type=int, default=100) + parser.add_argument("--gpu-index", type=int, default=0) + parser.add_argument("--pools", type=int, default=4) + args = parser.parse_args() + + if not torch.cuda.is_available(): + print(json.dumps({"error": "cuda unavailable"}, indent=2)) + return 1 + if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"): + print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2)) + return 1 + + torch.cuda.set_device(args.gpu_index) + props = torch.cuda.get_device_properties(args.gpu_index) + payload = { + "source": "pytorch_fp8_path_bench", + "torch": torch.__version__, + "cuda": torch.version.cuda, + "gpu_index": args.gpu_index, + "gpu_name": props.name, + "matrix_size": args.matrix_size, + "warmup": args.warmup, + "iterations": args.iterations, + "results": [], + } + try: + payload["results"].extend(bench_scaled_mm(args)) + payload["results"].append(bench_transformer_engine(args)) + except torch.cuda.OutOfMemoryError as exc: + payload["error"] = f"CUDA OOM: {exc}" + print(json.dumps(payload, indent=2)) + return 1 + + ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"] + if ok_values: + payload["summary"] = { + "max_tflops": round(max(ok_values), 1), + "min_tflops": round(min(ok_values), 1), + "mean_tflops": round(statistics.mean(ok_values), 1), + } + print(json.dumps(payload, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/run_cublaslt_fp8_gemm.sh b/scripts/run_cublaslt_fp8_gemm.sh new file mode 100755 index 0000000..49f4787 --- /dev/null +++ b/scripts/run_cublaslt_fp8_gemm.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -uo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)" + +CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}" +NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}" +OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}" +MATRIX_SIZE="${MATRIX_SIZE:-8192}" +WARMUP="${WARMUP:-20}" +ITERATIONS="${ITERATIONS:-200}" +GPU_COUNT="${GPU_COUNT:-8}" +FIRST_GPU="${FIRST_GPU:-0}" +WORKSPACE_MB="${WORKSPACE_MB:-256}" + +if [[ ! -x "$NVCC" ]]; then + echo "nvcc not found: $NVCC" >&2 + exit 1 +fi + +mkdir -p "$OUT_DIR" "$PROJECT_DIR/build" +HOST="$(hostname 2>/dev/null || echo unknown)" +TS="$(date +%Y%m%d_%H%M%S)" +BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" +REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json" + +"$NVCC" -O3 -std=c++17 -arch=sm_90 \ + "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \ + -lcublasLt -lcublas -o "$BIN" + +set +e +"$BIN" \ + --matrix-size "$MATRIX_SIZE" \ + --warmup "$WARMUP" \ + --iterations "$ITERATIONS" \ + --first-gpu "$FIRST_GPU" \ + --gpu-count "$GPU_COUNT" \ + --workspace-mb "$WORKSPACE_MB" \ + | tee "$REPORT" +status=${PIPESTATUS[0]} +set -e + +echo "Report written to: $REPORT" +exit "$status" diff --git a/scripts/run_fp8_path_comparison.sh b/scripts/run_fp8_path_comparison.sh new file mode 100755 index 0000000..46fd0e2 --- /dev/null +++ b/scripts/run_fp8_path_comparison.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)" + +PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}" +CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}" +NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}" +OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}" +MATRIX_SIZE="${MATRIX_SIZE:-8192}" +WARMUP="${WARMUP:-20}" +ITERATIONS="${ITERATIONS:-100}" +GPU_INDEX="${GPU_INDEX:-0}" +WORKSPACE_MB="${WORKSPACE_MB:-256}" +VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY' +import site +print(site.getsitepackages()[0]) +PY +)" +export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}" + +mkdir -p "$PROJECT_DIR/build" "$OUT_DIR" + +HOST="$(hostname 2>/dev/null || echo unknown)" +TS="$(date +%Y%m%d_%H%M%S)" +PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json" +CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json" +COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json" + +"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \ + --matrix-size "$MATRIX_SIZE" \ + --warmup "$WARMUP" \ + --iterations "$ITERATIONS" \ + --gpu-index "$GPU_INDEX" | tee "$PY_REPORT" + +"$NVCC" -O3 -std=c++17 -arch=sm_90 \ + "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \ + -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" + +"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \ + --matrix-size "$MATRIX_SIZE" \ + --warmup "$WARMUP" \ + --iterations "$ITERATIONS" \ + --first-gpu "$GPU_INDEX" \ + --gpu-count 1 \ + --workspace-mb "$WORKSPACE_MB" \ + --fast-accum 1 | tee "$CUBLAS_REPORT" + +"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY' +import json +import pathlib +import sys + +py_report = pathlib.Path(sys.argv[1]) +cublas_report = pathlib.Path(sys.argv[2]) +combined_report = pathlib.Path(sys.argv[3]) + +with py_report.open() as f: + py_payload = json.load(f) +with cublas_report.open() as f: + cublas_payload = json.load(f) + +combined = { + "source": "fp8_path_comparison", + "host": cublas_payload.get("host"), + "matrix_size": py_payload.get("matrix_size"), + "gpu_index": py_payload.get("gpu_index"), + "pytorch": py_payload, + "cublaslt": cublas_payload, + "results": [], +} +combined["results"].extend(py_payload.get("results", [])) +per_gpu = cublas_payload.get("per_gpu", []) +if per_gpu: + row = dict(per_gpu[0]) + row.update({ + "name": "E_direct_cublaslt_fast_accum", + "status": "ok", + "tflops": row.pop("fp8_tflops"), + "matrix_size": cublas_payload.get("matrix_size"), + "iterations": cublas_payload.get("iterations"), + "warmup": cublas_payload.get("warmup"), + "fast_accum": cublas_payload.get("fast_accum"), + "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.", + }) + combined["results"].append(row) + +combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8") +print(f"Combined report written to: {combined_report}") +PY + +echo "$COMBINED_REPORT"