diff --git a/configs/default.yaml b/configs/default.yaml index 09a3921..7951089 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -90,6 +90,7 @@ multinode_nccl: net_plugin: none nvls_enable: 1 split_data_on_qps: 1 + extra_env: {} min_peak_busbw_gbps: allreduce: 480 alltoall: 75 diff --git a/configs/multinode_nccl_diagnostic.yaml b/configs/multinode_nccl_diagnostic.yaml new file mode 100644 index 0000000..6afdc19 --- /dev/null +++ b/configs/multinode_nccl_diagnostic.yaml @@ -0,0 +1,60 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: diagnostic + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs diagnostic + begin_size: 256M + end_size: 256M + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 600 + debug: INFO + socket_ifname: bond0 + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: 4 + min_nchannels: 4 + net_plugin: none + nvls_enable: 1 + split_data_on_qps: 1 + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/modules/report.py b/modules/report.py index b82170b..c9e1b8d 100644 --- a/modules/report.py +++ b/modules/report.py @@ -492,6 +492,29 @@ class ReportGenerator: f"{threshold_text} | {topo.get('status', '?')} |" ) lines.append("") + diag_rows = [] 
+ for topo in data.get("topologies", []): + net = topo.get("network") or {} + if net: + diag_rows.append((topo, net)) + if diag_rows: + lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |") + lines.append("|----------|--------------|-----------------|-------------------|") + for topo, net in diag_rows: + networks = ", ".join(net.get("networks") or []) or "unknown" + gdr = net.get("gpu_direct_rdma", "UNKNOWN") + disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-" + lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {disabled} |") + lines.append("") + failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"] + if failed_topos: + lines.append("| Topology | Return Code | Error / Output Tail |") + lines.append("|----------|-------------|---------------------|") + for topo in failed_topos: + tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or "" + tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:] + lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |") + lines.append("") lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n") elif multinode and multinode.get("error"): lines.append("## Multi-node NCCL / Cross Leaf\n") diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md new file mode 100644 index 0000000..37cb75e --- /dev/null +++ b/reports_multinode_nccl_diagnosis_20260523.md @@ -0,0 +1,134 @@ +# 多机多卡 NCCL 诊断报告 + +- 日期:2026-05-23 +- 测试入口:`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12` +- 对端节点:`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16` +- 诊断配置:`configs/multinode_nccl_diagnostic.yaml` +- 原始脚本报告:`reports_multinode_nccl_diagnostic_2x8_debug_v2.md` + +## 当前结论 + +这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平,但 NCCL 在实际 2 节点通信时把 GPU Direct RDMA 禁用了,导致 NCCL 带宽显著低于验收阈值。 + +同时,`nccl-gpu-2` 的 SSH 入口不稳定,会造成 `mpirun` 拉起远端 rank 
失败。这个问题会直接影响 alltoall 等多机测试的稳定性,需要和 NCCL GDR 问题一起处理。 + +## 已完成的修正 + +1. 修正 `mpirun` 使用路径,避开系统 `/usr/bin/mpirun` 与 DOCA OpenMPI 动态库混用导致的崩溃。 +2. 补充 `LD_LIBRARY_PATH`,确保 `mpirun`、CUDA、pip 安装的 NCCL 动态库可同时解析。 +3. 将 NCCL HCA 限定到 400Gb/s 活跃端口:`mlx5_0,mlx5_1,mlx5_6,mlx5_7`。 +4. 在脚本中加入 multi-node NCCL 网络诊断解析,报告会展示 `NCCL Network`、`GPU Direct RDMA`、`GDR Disabled HCAs`。 +5. 增加 `multinode_nccl.extra_env`,可以在配置里快速试 NCCL 环境变量,不需要改代码。 +6. 增加诊断配置 `configs/multinode_nccl_diagnostic.yaml`,固定跑 2 节点 x 8 GPU、256M、`NCCL_DEBUG=INFO` 和 `NCCL_DEBUG_SUBSYS=INIT,NET`。 + +## 关键证据 + +### 1. CUDA RDMA perftest 通过 + +命令类型: + +```bash +CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 -s 4194304 -F --report_gbits 172.72.8.16 +``` + +结果: + +| 测试 | 设备 | GPU | 平均带宽 | 结论 | +|------|------|-----|----------|------| +| `ib_write_bw --use_cuda` | `mlx5_0` | GPU0 | `387.16 Gb/s` | PASS | + +解释:GPU 内存参与 RDMA 写带宽测试可以接近 400Gb/s,说明 `nvidia_peermem`/经典 GPUDirect RDMA 路径并非完全不可用。 + +### 2. CUDA DMA-BUF 路径不可用 + +命令类型: + +```bash +CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 --use_cuda_dmabuf -s 4194304 -F --report_gbits 172.72.8.16 +``` + +结果: + +| 测试 | 输出 | 结论 | +|------|------|------| +| `ib_write_bw --use_cuda_dmabuf` | `DMA-BUF is not supported on this GPU` | FAIL | + +解释:当前环境不能走 CUDA DMA-BUF RDMA。后续 NCCL 应优先确认是否能稳定走经典 `nvidia_peermem` 路径。 + +### 3. NCCL 单卡跨节点仍禁用 GDR + +已经尝试: + +- `NCCL_NET_GDR_LEVEL=SYS` +- `NCCL_NET_GDR_LEVEL=5` +- `NCCL_NET_GDR_READ=1` +- `NCCL_DMABUF_ENABLE=0` +- `NCCL_IB_CUDA_SUPPORT=1` +- `NCCL_IB_HCA=mlx5_0` + +结果仍显示: + +```text +NCCL INFO Using network IB +NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0' +``` + +256M allreduce 约 `13.4 GB/s`(约 107 Gb/s),明显低于 400Gb/s(约 50 GB/s)的 IB 端口能力。 + +### 4. 
脚本 2 节点 x 8 GPU 诊断结果 + +原始报告:`reports_multinode_nccl_diagnostic_2x8_debug_v2.md` + +| Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA | +|-----------|----------|-------------|-----------|--------|--------------|-----------------| +| allreduce | 2 nodes x 8 GPUs | `68.69 GB/s` | `>= 480 GB/s` | FAIL | IB | DISABLED | +| alltoall | 2 nodes x 8 GPUs | `0.00 GB/s` | `>= 75 GB/s` | FAIL | unknown | UNKNOWN | + +allreduce 失败原因是带宽不达标,且报告捕获到 GDR 被 NCCL 禁用: + +| GDR Disabled HCAs | +|-------------------| +| `mlx5_0, mlx5_1, mlx5_6, mlx5_7` | + +本轮 alltoall 失败的原因不是性能本身,而是在 `mpirun` 阶段受 SSH/网络发现问题影响而失败,报告尾部显示: + +```text +lack of common network interfaces and/or no route found between them +``` + +## 当前阻塞 + +### 阻塞 1:NCCL 禁用 GPU Direct RDMA + +现象: + +- IB 能被 NCCL 识别:`Using network IB` +- 400Gb/s HCA 被 NCCL 选中:`mlx5_0, mlx5_1, mlx5_6, mlx5_7` +- 但 NCCL 明确禁用 GDR:`GPU Direct RDMA Disabled` +- perftest 的经典 CUDA RDMA 又能跑到 `387.16 Gb/s` + +判断:底层 RDMA 能力存在,但 NCCL 的 GDR 判定/注册路径没有打通。优先排查 NCCL 与 NVIDIA driver、OFED、`nvidia_peermem`、NCCL net plugin/内部 IB 后端之间的兼容性。 + +### 阻塞 2:`nccl-gpu-2` SSH 不稳定 + +现象: + +- 多次出现:`kex_exchange_identification: Connection closed by remote host` +- MCP 直连 `nccl-gpu-2` 也会失败或长时间超时 +- `mpirun` 依赖 SSH 拉起远端 rank,因此 SSH 抖动会让 alltoall 这类测试直接没有有效输出 + +判断:需要先处理 `aikubeworker0016` 的 SSHD/连接限制/MaxStartups/安全策略,否则多机测试无法稳定复现。 + +## 建议下一步 + +1. 先修 `nccl-gpu-2` SSH 稳定性:检查 `sshd_config` 的 `MaxStartups`、连接限制、安全审计组件,以及是否有过多半开 SSH 会话。 +2. 对两台机器分别确认 `nvidia_peermem` 参数、OFED 版本、NVIDIA driver 版本一致性。 +3. 在两台机器上测试是否需要切换 `nvidia_peermem peerdirect_support` 模式,并在变更前确认没有正在运行的业务任务。 +4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin;当前日志显示 `No plugin found (libnccl-net.so)`,NCCL 使用的是 internal network plugin。 +5. 
SSH 稳定后重跑完整多机配置:2 节点 x 8 GPU,至少覆盖 `all_reduce_perf` 和 `alltoall_perf`,消息大小从 `1K` 到 `16G`。 + +## 当前可交付物 + +- `configs/multinode_nccl_diagnostic.yaml`:多机多卡诊断配置 +- `reports_multinode_nccl_diagnostic_2x8_debug_v2.md`:脚本生成的原始 2x8 诊断报告 +- `reports_multinode_nccl_diagnosis_20260523.md`:本中文诊断总结 diff --git a/reports_multinode_nccl_diagnostic_2x8_debug_v2.md b/reports_multinode_nccl_diagnostic_2x8_debug_v2.md new file mode 100644 index 0000000..2076245 --- /dev/null +++ b/reports_multinode_nccl_diagnostic_2x8_debug_v2.md @@ -0,0 +1,66 @@ +# GPU Test Report + +- **Date:** 2026-05-23T07:37:41.426792 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| Multi-node NCCL | FAIL | + +## Multi-node NCCL / Cross Leaf + +Source: nccl-tests-mpirun | Mode: diagnostic + +- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16) +- **Preflight:** PASS (1 warnings) + +### Multi-node NCCL allreduce + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs diagnostic | 68.69 GB/s | 256M | 68.21 GB/s | >= 480 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs | +|----------|--------------|-----------------|-------------------| +| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs diagnostic | 0 | aikubeworker0012:2139504:2139504 [0] NCCL INFO comm 0x55646d15f590 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 68.2135 # # Collective test concluded: all_reduce_perf # | + +### Multi-node NCCL 
alltoall + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs diagnostic | 0.00 GB/s | | 0.00 GB/s | >= 75 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs | +|----------|--------------|-----------------|-------------------| +| 2 nodes x 8 GPUs diagnostic | unknown | UNKNOWN | - | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs diagnostic | 255 | lack of common network interfaces and/or no route found between them. Please check network connectivity (including firewalls and network routing requirements). -------------------------------------------------------------------------- | + +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file