From 6c9f049b71c39f95bb1ddd5268e323c8fa133494 Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 16:12:32 +0800 Subject: [PATCH] Tune multinode NCCL auto parameters --- configs/default.yaml | 6 +- configs/multinode_nccl_diagnostic.yaml | 6 +- configs/multinode_nccl_nccl227_16g.yaml | 6 +- configs/multinode_nccl_nccl227_auto_16g.yaml | 62 +++++++++++++++++ .../multinode_nccl_nccl227_diagnostic.yaml | 6 +- configs/multinode_nccl_nccl227_sweep.yaml | 6 +- ...rts_multinode_nccl_16g_2x8_nccl227_auto.md | 66 +++++++++++++++++++ reports_multinode_nccl_diagnosis_20260523.md | 54 ++++++++++++--- 8 files changed, 187 insertions(+), 25 deletions(-) create mode 100644 configs/multinode_nccl_nccl227_auto_16g.yaml create mode 100644 reports_multinode_nccl_16g_2x8_nccl227_auto.md diff --git a/configs/default.yaml b/configs/default.yaml index b3956a4..cd214e4 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -87,11 +87,11 @@ multinode_nccl: ib_tc: 136 ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 ib_timeout: 22 - qps_per_connection: 4 - min_nchannels: 4 + qps_per_connection: null + min_nchannels: null net_plugin: none nvls_enable: 1 - split_data_on_qps: 1 + split_data_on_qps: null extra_env: {} min_peak_busbw_gbps: allreduce: 480 diff --git a/configs/multinode_nccl_diagnostic.yaml b/configs/multinode_nccl_diagnostic.yaml index 3741b37..0e6479d 100644 --- a/configs/multinode_nccl_diagnostic.yaml +++ b/configs/multinode_nccl_diagnostic.yaml @@ -47,11 +47,11 @@ multinode_nccl: ib_tc: 136 ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 ib_timeout: 22 - qps_per_connection: 4 - min_nchannels: 4 + qps_per_connection: null + min_nchannels: null net_plugin: none nvls_enable: 1 - split_data_on_qps: 1 + split_data_on_qps: null extra_env: NCCL_DEBUG_SUBSYS: INIT,NET NCCL_NET_GDR_LEVEL: 5 diff --git a/configs/multinode_nccl_nccl227_16g.yaml b/configs/multinode_nccl_nccl227_16g.yaml index e7b718f..c5552fe 100644 --- a/configs/multinode_nccl_nccl227_16g.yaml +++ b/configs/multinode_nccl_nccl227_16g.yaml @@ -47,11 +47,11 @@ multinode_nccl: ib_tc: 136 ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 ib_timeout: 22 - qps_per_connection: 4 - min_nchannels: 4 + qps_per_connection: null + min_nchannels: null net_plugin: none nvls_enable: 1 - split_data_on_qps: 1 + split_data_on_qps: null extra_env: NCCL_DEBUG_SUBSYS: INIT,NET NCCL_NET_GDR_LEVEL: 5 diff --git a/configs/multinode_nccl_nccl227_auto_16g.yaml b/configs/multinode_nccl_nccl227_auto_16g.yaml new file mode 100644 index 0000000..2492989 --- /dev/null +++ b/configs/multinode_nccl_nccl227_auto_16g.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: large-message-nccl-2.27.7-auto + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G + begin_size: 16G + end_size: 16G + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 1200 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/configs/multinode_nccl_nccl227_diagnostic.yaml b/configs/multinode_nccl_nccl227_diagnostic.yaml index 8a769ad..5465772 100644 --- a/configs/multinode_nccl_nccl227_diagnostic.yaml +++ b/configs/multinode_nccl_nccl227_diagnostic.yaml @@ -47,11 +47,11 @@ multinode_nccl: ib_tc: 136 ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 ib_timeout: 22 - qps_per_connection: 4 - min_nchannels: 4 + qps_per_connection: null + min_nchannels: null net_plugin: none nvls_enable: 1 - split_data_on_qps: 1 + split_data_on_qps: null extra_env: NCCL_DEBUG_SUBSYS: INIT,NET NCCL_NET_GDR_LEVEL: 5 diff --git a/configs/multinode_nccl_nccl227_sweep.yaml b/configs/multinode_nccl_nccl227_sweep.yaml index 3dcbf36..da96ef1 100644 --- a/configs/multinode_nccl_nccl227_sweep.yaml +++ b/configs/multinode_nccl_nccl227_sweep.yaml @@ -47,11 +47,11 @@ multinode_nccl: ib_tc: 136 ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 ib_timeout: 22 - qps_per_connection: 4 - min_nchannels: 4 + qps_per_connection: null + min_nchannels: null net_plugin: none nvls_enable: 1 - split_data_on_qps: 1 + split_data_on_qps: null extra_env: NCCL_DEBUG_SUBSYS: INIT,NET NCCL_NET_GDR_LEVEL: 5 diff --git a/reports_multinode_nccl_16g_2x8_nccl227_auto.md b/reports_multinode_nccl_16g_2x8_nccl227_auto.md new file mode 100644 index 0000000..0481813 --- /dev/null +++ b/reports_multinode_nccl_16g_2x8_nccl227_auto.md @@ -0,0 +1,66 @@ +# GPU Test Report + +- **Date:** 2026-05-23T08:09:56.340954 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| Multi-node NCCL | FAIL | + +## Multi-node NCCL / Cross Leaf + +Source: nccl-tests-mpirun | Mode: large-message-nccl-2.27.7-auto + +- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16) +- **Preflight:** PASS + +### Multi-node NCCL allreduce + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 354.60 GB/s | 16G | 354.57 GB/s | >= 480 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | 0012:2149404:2149572 [7] NCCL INFO comm 0x560bd3541a30 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1066162:1066981 [5] NCCL INFO comm 0x55e73208e200 rank 13 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE | + +### Multi-node NCCL alltoall + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 30.01 GB/s | 16G | 30.02 GB/s | >= 75 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | r0012:2149589:2149764 [7] NCCL INFO comm 0x55fef234b7c0 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0012:2149588:2149765 [6] NCCL INFO comm 0x5637718f1dd0 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE | + +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md index bc20b72..79325a3 100644 --- a/reports_multinode_nccl_diagnosis_20260523.md +++ b/reports_multinode_nccl_diagnosis_20260523.md @@ -3,14 +3,16 @@ - 日期:2026-05-23 - 测试入口:`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12` - 对端节点:`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16` -- 诊断配置:`configs/multinode_nccl_diagnostic.yaml` -- 原始脚本报告:`reports_multinode_nccl_diagnostic_2x8_sshfix.md` +- 诊断配置:`configs/multinode_nccl_nccl227_auto_16g.yaml` +- 当前最佳原始脚本报告:`reports_multinode_nccl_16g_2x8_nccl227_auto.md` ## 当前结论 这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平;最初使用 pip 包里的 NCCL 2.21.5 时,NCCL 在实际 2 节点通信中把 GPU Direct RDMA 禁用了,导致带宽显著偏低。 -后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后,NCCL GDR 已经恢复启用,2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`,alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。当前剩余问题不再是 GDR disabled,而是 GDR enabled 后仍低于当前配置里的验收阈值。 +后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后,NCCL GDR 已经恢复启用,2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`,alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。 + +继续 tuning 后发现,配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后,正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`,alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled,而是 GDR enabled 且 NCCL 自动调参后,仍低于当前配置里的验收阈值。 同时,`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝,导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。 @@ -26,6 +28,7 @@ 8. 将 OpenMPI OOB TCP 控制通道固定到 `bond0`,并加入 `plm_rsh_args`,减少 `mpirun` 远端启动受 SSH/host key/接口选择影响的概率。 9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`,解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`,用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。 10. 增强报告解析,能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`,并列出 enabled/disabled HCA。 +11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`,避免默认导出会压低大包 allreduce 的固定 NCCL 参数。 ## 关键证据 @@ -141,7 +144,35 @@ allreduce 和 alltoall 本轮均正常完成,`returncode=0`、`wrong=0`,失 | allreduce | `237.86 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED | | alltoall | `28.62 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED | -解释:NCCL 2.27.7 已经修复 GDR 禁用问题,且性能提升明显;但在当前跨节点/跨 Leaf 环境和当前阈值下仍不达标。allreduce 约稳定在 `238 GB/s`,alltoall 约稳定在 `28-29 GB/s`。 +解释:NCCL 2.27.7 已经修复 GDR 禁用问题,且性能提升明显;但在固定 `min_nchannels=4/qps=4/split=1` 的配置下仍不达标。allreduce 约稳定在 `238 GB/s`,alltoall 约稳定在 `28-29 GB/s`。 + +### 4.2 NCCL 2.27.7 自动通道/QP 参数结果 + +进一步对 16G 大包做 tuning,发现默认配置里锁定的参数会压低 allreduce: + +| 配置 | allreduce Avg Bus BW | alltoall Avg Bus BW | 结论 | +|------|----------------------|---------------------|------| +| NCCL 2.27.7 + 固定 `min_nchannels=4/qps=4/split=1` | `238.56 GB/s` | `28.62 GB/s` | GDR 已启用,但 allreduce 被压低 | +| NCCL 2.27.7 + NCCL 自动选择 channel/QP | `354.57 GB/s` | `30.02 GB/s` | 当前最佳脚本结果 | + +正式脚本报告:`reports_multinode_nccl_16g_2x8_nccl227_auto.md` + +| Operation | Peak Bus BW | Avg Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA | +|-----------|-------------|------------|-----------|-----------|--------|-----------------| +| allreduce | `354.60 GB/s` | `354.57 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED | +| alltoall | `30.01 GB/s` | `30.02 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED | + +对比临时 tuning 命令: + +| 变量组合 | allreduce Avg Bus BW | alltoall Avg Bus BW | +|----------|----------------------|---------------------| +| baseline auto | `353.63 GB/s` | `30.05 GB/s` | +| `NCCL_IB_MERGE_NICS=1` | `352.73 GB/s` | `30.07 GB/s` | +| `NCCL_CROSS_NIC=1` | `354.68 GB/s` | `30.05 GB/s` | +| `NCCL_IB_QPS_PER_CONNECTION=8` + `NCCL_IB_SPLIT_DATA_ON_QPS=0` | `350.91 GB/s` | `29.41 GB/s` | +| `NCCL_MIN_NCHANNELS=16` + `NCCL_MAX_NCHANNELS=16` | `354.32 GB/s` | `30.06 GB/s` | + +解释:allreduce 的主要提升来自取消不合适的固定参数,而不是 `MERGE_NICS` 或 `CROSS_NIC`。alltoall 对这些参数不敏感,当前基本稳定在 `30 GB/s` 左右。 ### 5. SSHD MaxStartups 阻塞已临时缓解 @@ -205,12 +236,12 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0' 判断:底层 RDMA 能力存在,GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。 -### 阻塞 2:GDR enabled 后带宽仍低于当前阈值 +### 阻塞 2:GDR enabled 且 NCCL 自动调参后带宽仍低于当前阈值 现象: -- 2x8 16G allreduce:`237.86 GB/s`,阈值 `>= 480 GB/s` -- 2x8 16G alltoall:`28.62 GB/s`,阈值 `>= 75 GB/s` +- 2x8 16G allreduce:`354.60 GB/s`,阈值 `>= 480 GB/s` +- 2x8 16G alltoall:`30.01 GB/s`,阈值 `>= 75 GB/s` - 已使用 4 个 400Gb/s HCA:`mlx5_0, mlx5_1, mlx5_6, mlx5_7` 判断:需要确认当前 PDF/config 阈值是否适用于跨 Leaf 两节点场景;如果阈值确实要求跨 Leaf 也达到这些数值,则还需要继续查链路聚合、多 rail 使用、交换网络、NCCL net plugin/SHARP 或 rail mapping。 @@ -230,9 +261,10 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0' 1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力,或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。 2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本,不要依赖 pip NCCL 2.21.5;当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。 -3. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP;当前日志显示 `Could not find: libnccl-net.so`,NCCL 使用的是 internal IB plugin。 -4. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数,确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。 -5. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景;如果是,继续按链路和 NCCL rail 聚合方向排查。 +3. multi-node NCCL 默认不要固定 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`;当前脚本配置已改成 `null`,让 NCCL 自动选择。 +4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP;当前日志显示 `Could not find: libnccl-net.so`,NCCL 使用的是 internal IB plugin。 +5. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数,确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。 +6. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景;如果是,继续按链路和 NCCL rail 聚合方向排查。 ## 当前可交付物 @@ -240,8 +272,10 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0' - `configs/multinode_nccl_nccl227_diagnostic.yaml`:NCCL 2.27.7 256M 诊断配置 - `configs/multinode_nccl_nccl227_sweep.yaml`:NCCL 2.27.7 1M 到 4G sweep 配置 - `configs/multinode_nccl_nccl227_16g.yaml`:NCCL 2.27.7 16G 大包配置 +- `configs/multinode_nccl_nccl227_auto_16g.yaml`:NCCL 2.27.7 16G 自动 channel/QP 配置 - `reports_multinode_nccl_diagnostic_2x8_sshfix.md`:脚本生成的原始 2x8 诊断报告 - `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`:NCCL 2.27.7 256M 诊断报告 - `reports_multinode_nccl_sweep_2x8_nccl227.md`:NCCL 2.27.7 1M 到 4G sweep 报告 - `reports_multinode_nccl_16g_2x8_nccl227.md`:NCCL 2.27.7 16G 大包报告 +- `reports_multinode_nccl_16g_2x8_nccl227_auto.md`:NCCL 2.27.7 16G 自动 channel/QP 原始报告 - `reports_multinode_nccl_diagnosis_20260523.md`:本中文诊断总结