From f5699bf85af288d568c0b85f2d7aa59824176a0a Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 20:07:47 +0800 Subject: [PATCH] Add multinode NCCL all collectives run --- ...node_nccl_nccl227_all_collectives_2x8.yaml | 72 +++++++++ modules/multinode_nccl_test.py | 15 ++ ...de_nccl_all_collectives_20260523_120144.md | 98 ++++++++++++ ...inode_nccl_all_collectives_run_20260523.md | 49 ++++++ ...ts_multinode_nccl_handoff_plan_20260523.md | 14 ++ ...ts_multinode_nccl_latest_index_20260523.md | 35 ++++- scripts/run_multinode_nccl_all_collectives.sh | 147 ++++++++++++++++++ 7 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 configs/multinode_nccl_nccl227_all_collectives_2x8.yaml create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144.md create mode 100644 reports_multinode_nccl_all_collectives_run_20260523.md create mode 100755 scripts/run_multinode_nccl_all_collectives.sh diff --git a/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml new file mode 100644 index 0000000..1e5d464 --- /dev/null +++ b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml @@ -0,0 +1,72 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: cross-leaf-all-collectives-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + - broadcast_perf + - reduce_scatter_perf + - all_gather_perf + - sendrecv_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 
8 GPUs (all collectives evidence run) + op_env: + alltoall: + NCCL_PXN_DISABLE: 1 + begin_size: 16G + end_size: 16G + step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 491.84 + alltoall: 76.54 + broadcast: 0 + reducescatter: 0 + allgather: 0 + sendrecv: 0 diff --git a/modules/multinode_nccl_test.py b/modules/multinode_nccl_test.py index c5afed6..f97c96e 100644 --- a/modules/multinode_nccl_test.py +++ b/modules/multinode_nccl_test.py @@ -18,14 +18,29 @@ _TEST_ALIASES = { "allreduce": "all_reduce_perf", "all_reduce": "all_reduce_perf", "all_reduce_perf": "all_reduce_perf", + "allgather": "all_gather_perf", + "all_gather": "all_gather_perf", + "all_gather_perf": "all_gather_perf", "alltoall": "alltoall_perf", "all_to_all": "alltoall_perf", "alltoall_perf": "alltoall_perf", + "broadcast": "broadcast_perf", + "broadcast_perf": "broadcast_perf", + "reducescatter": "reduce_scatter_perf", + "reduce_scatter": "reduce_scatter_perf", + "reduce_scatter_perf": "reduce_scatter_perf", + "sendrecv": "sendrecv_perf", + "send_recv": "sendrecv_perf", + "sendrecv_perf": "sendrecv_perf", } _OP_LABELS = { "all_reduce_perf": "allreduce", + "all_gather_perf": "allgather", "alltoall_perf": "alltoall", + "broadcast_perf": "broadcast", + "reduce_scatter_perf": "reducescatter", + "sendrecv_perf": "sendrecv", } diff --git a/reports_multinode_nccl_all_collectives_20260523_120144.md b/reports_multinode_nccl_all_collectives_20260523_120144.md new file mode 100644 
index 0000000..2b1d604 --- /dev/null +++ b/reports_multinode_nccl_all_collectives_20260523_120144.md @@ -0,0 +1,98 @@ +# GPU Test Report + +- **Date:** 2026-05-23T12:04:48.257734 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Multi-node NCCL: FAIL + +## Summary + +| Test | Result | +|------|--------| +| Multi-node NCCL | FAIL | + +## Multi-node NCCL / Cross Leaf + +Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7 + +- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts` +- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16) +- **Preflight:** PASS + +### Multi-node NCCL allreduce + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 354.452 # | + +### Multi-node NCCL alltoall + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence 
run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +| Topology | Return Code | Error / Output Tail | +|----------|-------------|---------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE | + +### Multi-node NCCL broadcast + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +### Multi-node NCCL reducescatter + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +### Multi-node NCCL 
allgather + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +### Multi-node NCCL sendrecv + +| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|----------------------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS | + +| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | +|----------|--------------|-----------------|------------------|-------------------| +| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - | + +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_multinode_nccl_all_collectives_run_20260523.md b/reports_multinode_nccl_all_collectives_run_20260523.md new file mode 100644 index 0000000..9468190 --- /dev/null +++ b/reports_multinode_nccl_all_collectives_run_20260523.md @@ -0,0 +1,49 @@ +# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23 + +## 测试对象 + +- 节点:`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)` +- 拓扑:`2 nodes x 8 GPUs` +- NCCL:`2.27.7` +- nccl-tests:`/data/nccl-tests-latest/build` +- 配置:`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` +- 入口:`scripts/run_multinode_nccl_all_collectives.sh` +- 远端报告:`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md` +- 远端 
artifacts:`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts` +- 本地报告:`reports_multinode_nccl_all_collectives_20260523_120144.md` + +## 一句话结论 + +这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景:`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通,`returncode=0`、`wrong_count=0`,并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值,`allreduce` 和 `alltoall` 仍 FAIL;新增的 4 项目前没有 PDF 跨节点阈值,因此只作为证据采集项,不判生产验收性能。 + +## 结果表 + +| Operation | Peak Bus BW | Threshold | Correctness | Network | Status | +|---|---:|---:|---|---|---| +| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL | +| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL | +| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence | +| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence | +| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence | +| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence | + +## 怎么解读 + +1. 这次不是替代 PDF matrix,而是补齐多机多卡 collective 覆盖面。 +2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值,所以报告整体是 `FAIL`。 +3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑、正确性为 0 wrong、走 IB/GDRDMA”,还不能证明生产性能达标,因为手头 PDF matrix 没给这 4 项跨节点阈值。 +4. 
新增 4 项的带宽大致呈现两个层次: + - `broadcast/reducescatter/allgather` 在 `191-193 GB/s`,接近当前 4 x 400G rail 的单向原始上限。 + - `sendrecv` 只有 `26.98 GB/s`,需要结合 sendrecv 的 traffic pattern 单独解读,不能直接和 allreduce busbw 混比。 + +## 校验信息 + +```text +06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3 reports/multinode_nccl_all_collectives_20260523_120144.md +020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +``` diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md index e91ff01..80b27c5 100644 --- a/reports_multinode_nccl_handoff_plan_20260523.md +++ b/reports_multinode_nccl_handoff_plan_20260523.md @@ -16,6 +16,7 @@ | 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,所有 case 正确性通过;除 2x2 allreduce 外,性能阈值仍 FAIL | | 原始 artifacts 已归档 | 
`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`,每个 case 有完整 `cmd/stdout/stderr/json` | | artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`,确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA,未见 SHARP/CollNet | +| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`,2x8 下 6 项均正确性通过,allreduce/alltoall 按 PDF 阈值仍 FAIL | | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 | | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`,未见 SHARP/HCOLL 包 | @@ -140,6 +141,15 @@ cd /root/test_gpu_scripts bash scripts/run_multinode_nccl_pdf_matrix.sh ``` +### 多机多卡 2x8 六项 collective 补测 + +```bash +cd /root/test_gpu_scripts +bash scripts/run_multinode_nccl_all_collectives.sh +``` + +说明:这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`;新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项,不强行套 PDF allreduce/alltoall 阈值。 + ### 完整深度诊断 ```bash @@ -173,6 +183,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 | | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum | | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 | +| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 | +| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 | | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 | | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 | | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 | @@ -182,7 +194,9 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 | | 
`scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 | | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` | +| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 | +| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 | ## 当前建议 diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md index 2ff15e1..ebc3481 100644 --- a/reports_multinode_nccl_latest_index_20260523.md +++ b/reports_multinode_nccl_latest_index_20260523.md @@ -8,6 +8,7 @@ - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑,原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`,artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。 - 已补充 artifacts 信号分析:`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`,都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`,都有 GDRDMA 信号,但没有 SHARP/CollNet/外部 NCCL net plugin 证据。 +- 已补充并实跑多机多卡 2x8 六项 collective:`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`;已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值,但严格按阈值仍 FAIL。 - 2 机 8 GPU 档位仍未达到 PDF 参考值: - allreduce 实测 `353.85 GB/s busbw`,PDF 目标 `491.84 GB/s`。 @@ -22,8 +23,9 @@ | 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划,包含决策树、要问的问题和复跑命令 | | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价,重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP | | 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析,确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 | -| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF 
matrix 结果摘要 | -| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep | +| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果,补齐单机 test all 的 NCCL 覆盖面 | +| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 | +| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep | ## 关键脚本 @@ -33,7 +35,9 @@ | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload | | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 | | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON | +| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口,跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv,并归档每个 case | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` | +| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置,allreduce/alltoall 保留 PDF 阈值,新增 4 项暂按证据采集 | | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook | 多机多卡 PDF 矩阵: @@ -43,6 +47,13 @@ cd /root/test_gpu_scripts bash scripts/run_multinode_nccl_pdf_matrix.sh ``` +多机多卡 2x8 六项 collective 补测: + +```bash +cd /root/test_gpu_scripts +bash scripts/run_multinode_nccl_all_collectives.sh +``` + 单节点 H100 原始 all 报告: ```bash @@ -88,6 +99,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md +/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md 
/root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md ``` @@ -123,6 +135,15 @@ summary: reports_multinode_nccl_pdf_matrix_run_20260523.md manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md ``` +最新多机多卡 2x8 六项 collective 补测: + +```text +aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md +artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts +local copy: reports_multinode_nccl_all_collectives_20260523_120144.md +summary: reports_multinode_nccl_all_collectives_run_20260523.md +``` + 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成: ```text @@ -131,6 +152,14 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m 目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。 +下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时,还会生成: + +```text +/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/ +``` + +目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据;已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。 + ## 当前证据摘要 ### HCA / rail @@ -200,6 +229,8 @@ PXN disabled sweep 未发现有效参数: | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 | | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum | | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 | +| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 | +| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 | | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 | | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 | | `reports_rdma_single_node_summary.md` | 
单节点 RDMA/HCA 速率摘要 |
diff --git a/scripts/run_multinode_nccl_all_collectives.sh b/scripts/run_multinode_nccl_all_collectives.sh
new file mode 100755
index 0000000..819e893
--- /dev/null
+++ b/scripts/run_multinode_nccl_all_collectives.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
+# collectives used by the single-node H100 acceptance flow.
+#
+# Steps: parse options, resolve a Python interpreter, optionally run the
+# preflight (scripts/multinode_nccl_deep_diagnose.sh preflight), then run
+# gpu_tester.py with MULTINODE_NCCL_ARTIFACT_DIR set to a per-run directory.
+#
+# `set -e` is not enabled: the preflight and workload exit codes are captured
+# explicitly and become this script's exit status.
+#
+# Environment defaults (overridden by the matching options):
+#   PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT
+#
+# Exit status: 0 on success, 1 on a local setup error, 2 on a usage error,
+# otherwise the exit code of the failed preflight or workload.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
+CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+FORMAT="${FORMAT:-md}"
+DRY_RUN=0
+RUN_PREFLIGHT=1
+PREFLIGHT_ONLY=0
+
+# Print the option summary to stdout.
+usage() {
+  cat <<'EOF'
+Usage: run_multinode_nccl_all_collectives.sh [options]
+
+Options:
+  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
+  --out-dir PATH      Report output directory (default: reports)
+  --format FORMAT     Report format: md, json, or html (default: md)
+  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
+  --preflight-only    Run only the preflight check, not the workload
+  --dry-run           Print commands without running them
+  -h, --help          Show this help
+EOF
+}
+
+# Print message $1 and the usage text to stderr, then exit with status 2.
+usage_error() {
+  echo "$1" >&2
+  usage >&2
+  exit 2
+}
+
+# Exit with a usage error unless option $1 is followed by a value.
+# $2 is the count of unparsed arguments, including the option itself.
+# Without this check a trailing "--python" dies on an unbound "$2".
+require_value() {
+  if (($2 < 2)); then
+    usage_error "Option $1 requires a value"
+  fi
+}
+
+while (($#)); do
+  case "$1" in
+    --python)
+      require_value "$1" "$#"
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      require_value "$1" "$#"
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      require_value "$1" "$#"
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      require_value "$1" "$#"
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      usage_error "Unknown argument: $1"
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if ((PREFLIGHT_ONLY && RUN_PREFLIGHT == 0)); then
+  echo "Warning: --preflight-only with --no-preflight runs nothing." >&2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  # Fall back to python3 from PATH, but report it so that a mistyped
+  # --python / PYTHON_BIN does not go unnoticed.
+  fallback_python="$(command -v python3 || true)"
+  if [[ -n "$fallback_python" ]]; then
+    echo "Warning: $PYTHON_BIN is not executable; using $fallback_python" >&2
+  fi
+  PYTHON_BIN="$fallback_python"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+
+REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+RUN_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
+echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
+echo "Topology: 2 nodes x 8 GPUs per node; 16G"
+
+# Dry run: print the shell-quoted commands only; nothing is created or run.
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN workload:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
+  printf ' %q' "${RUN_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+
+# Fail before the preflight if the workload could not start anyway.
+if ((PREFLIGHT_ONLY == 0)) && [[ ! -r "$CONFIG_FILE" ]]; then
+  echo "Config file not found or not readable: $CONFIG_FILE" >&2
+  exit 1
+fi
+
+if ! mkdir -p -- "$OUT_DIR"; then
+  echo "Cannot create report directory: $OUT_DIR" >&2
+  exit 1
+fi
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}"
+  preflight_status=$?
+  if ((preflight_status != 0)); then
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  fi
+fi
+
+if ((PREFLIGHT_ONLY)); then
+  exit 0
+fi
+
+if ! mkdir -p -- "$ARTIFACT_DIR"; then
+  echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
+  exit 1
+fi
+
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
+status=$?
+
+# Only claim the report exists when the tester actually produced it.
+if [[ -e "$REPORT_FILE" ]]; then
+  echo "Report written to: $REPORT_FILE"
+else
+  echo "Report was not written: $REPORT_FILE" >&2
+fi
+echo "Artifacts written to: $ARTIFACT_DIR"
+if ((status != 0)); then
+  echo "Workload exited with code $status" >&2
+fi
+exit "$status"