Add multinode NCCL all collectives run

2026-05-23 20:07:47 +08:00 · 2026-05-23 20:07:47 +08:00 · f5699bf85a
commit f5699bf85a
parent 59621d26a0
7 changed files with 428 additions and 2 deletions
--- a/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
+++ b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
@ -0,0 +1,72 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: cross-leaf-all-collectives-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+    - broadcast_perf
+    - reduce_scatter_perf
+    - all_gather_perf
+    - sendrecv_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs (all collectives evidence run)
+      op_env:
+        alltoall:
+          NCCL_PXN_DISABLE: 1
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:  # per-collective minimum peak bus bandwidth (GB/s); 0 appears to mean "no threshold" (report renders "-") - TODO confirm in parser
+    allreduce: 491.84  # known PDF 2x8 threshold
+    alltoall: 76.54  # known PDF 2x8 threshold
+    broadcast: 0  # evidence-only: no PDF cross-node threshold available yet
+    reducescatter: 0  # evidence-only: no PDF cross-node threshold available yet
+    allgather: 0  # evidence-only: no PDF cross-node threshold available yet
+    sendrecv: 0  # evidence-only: no PDF cross-node threshold available yet
--- a/modules/multinode_nccl_test.py
+++ b/modules/multinode_nccl_test.py
@ -18,14 +18,29 @@ _TEST_ALIASES = {
    "allreduce": "all_reduce_perf",
    "all_reduce": "all_reduce_perf",
    "all_reduce_perf": "all_reduce_perf",
+    "allgather": "all_gather_perf",
+    "all_gather": "all_gather_perf",
+    "all_gather_perf": "all_gather_perf",
    "alltoall": "alltoall_perf",
    "all_to_all": "alltoall_perf",
    "alltoall_perf": "alltoall_perf",
+    "broadcast": "broadcast_perf",
+    "broadcast_perf": "broadcast_perf",
+    "reducescatter": "reduce_scatter_perf",
+    "reduce_scatter": "reduce_scatter_perf",
+    "reduce_scatter_perf": "reduce_scatter_perf",
+    "sendrecv": "sendrecv_perf",
+    "send_recv": "sendrecv_perf",
+    "sendrecv_perf": "sendrecv_perf",
 }

 _OP_LABELS = {
    "all_reduce_perf": "allreduce",
+    "all_gather_perf": "allgather",
    "alltoall_perf": "alltoall",
+    "broadcast_perf": "broadcast",
+    "reduce_scatter_perf": "reducescatter",
+    "sendrecv_perf": "sendrecv",
 }


--- a/reports_multinode_nccl_all_collectives_20260523_120144.md
+++ b/reports_multinode_nccl_all_collectives_20260523_120144.md
@ -0,0 +1,98 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T12:04:48.257734
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Multi-node NCCL: FAIL
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7
+
+- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 354.452  #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
+
+### Multi-node NCCL broadcast
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL reducescatter
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL allgather
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL sendrecv
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
--- a/reports_multinode_nccl_all_collectives_run_20260523.md
+++ b/reports_multinode_nccl_all_collectives_run_20260523.md
@ -0,0 +1,49 @@
+# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23
+
+## 测试对象
+
+- 节点：`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)`
+- 拓扑：`2 nodes x 8 GPUs`
+- NCCL：`2.27.7`
+- nccl-tests：`/data/nccl-tests-latest/build`
+- 配置：`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml`
+- 入口：`scripts/run_multinode_nccl_all_collectives.sh`
+- 远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md`
+- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
+- 本地报告：`reports_multinode_nccl_all_collectives_20260523_120144.md`
+
+## 一句话结论
+
+这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景：`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通，`returncode=0`、`wrong_count=0`，并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值，`allreduce` 和 `alltoall` 仍 FAIL；新增的 4 项目前没有 PDF 跨节点阈值，因此只作为证据采集项，不判生产验收性能。
+
+## 结果表
+
+| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
+|---|---:|---:|---|---|---|
+| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
+| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
+| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+
+## 怎么解读
+
+1. 这次不是替代 PDF matrix，而是补齐多机多卡 collective 覆盖面。
+2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值，所以报告整体是 `FAIL`。
+3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑、正确性为 0 wrong、走 IB/GDRDMA”，还不能证明生产性能达标，因为手头 PDF matrix 没给这 4 项跨节点阈值。
+4. 新增 4 项的带宽大致呈现两个层次：
+   - `broadcast/reducescatter/allgather` 在 `191-193 GB/s`，接近当前 4 x 400G rail 的单向原始上限（4 × 400 Gb/s = 1600 Gb/s ≈ 200 GB/s）。
+   - `sendrecv` 只有 `26.98 GB/s`，需要结合 sendrecv 的 traffic pattern 单独解读，不能直接和 allreduce busbw 混比。
+
+## 校验信息
+
+```text
+06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
+020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+```
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@ -16,6 +16,7 @@
 | 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，所有 case 正确性通过；除 2x2 allreduce 外，性能阈值仍 FAIL |
 | 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
 | artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
+| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`，2x8 下 6 项均正确性通过，allreduce/alltoall 按 PDF 阈值仍 FAIL |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |

@ -140,6 +141,15 @@ cd /root/test_gpu_scripts
 bash scripts/run_multinode_nccl_pdf_matrix.sh
 ```

+### 多机多卡 2x8 六项 collective 补测
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+说明：这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`；新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项，不强行套 PDF allreduce/alltoall 阈值。
+
 ### 完整深度诊断

 ```bash
@ -173,6 +183,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
+| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
@ -182,7 +194,9 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
 | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
+| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
+| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 |

 ## 当前建议

--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@ -8,6 +8,7 @@

 - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
 - 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
+- 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
  - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@ -22,8 +23,9 @@
 | 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
 | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
 | 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |

 ## 关键脚本

@ -33,7 +35,9 @@
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
 | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
 | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall，并归档每个 case 的 command/stdout/stderr/parsed JSON |
+| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口，跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv，并归档每个 case |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置，固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
+| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置，allreduce/alltoall 保留 PDF 阈值，新增 4 项暂按证据采集 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |

 多机多卡 PDF 矩阵：
@ -43,6 +47,13 @@ cd /root/test_gpu_scripts
 bash scripts/run_multinode_nccl_pdf_matrix.sh
 ```

+多机多卡 2x8 六项 collective 补测：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
 单节点 H100 原始 all 报告：

 ```bash
@ -88,6 +99,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
 ```

@ -123,6 +135,15 @@ summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
 manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
 ```

+最新多机多卡 2x8 六项 collective 补测：
+
+```text
+aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
+artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
+local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
+summary: reports_multinode_nccl_all_collectives_run_20260523.md
+```
+
 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：

 ```text
@ -131,6 +152,14 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m

 目录内按 case 保存完整 `cmd/stdout/stderr/json`，用于给网络/硬件侧复核原始 NCCL 输出。

+下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时，还会生成：
+
+```text
+/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/
+```
+
+目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据；已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。
+
 ## 当前证据摘要

 ### HCA / rail
@ -200,6 +229,8 @@ PXN disabled sweep 未发现有效参数：
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
+| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
--- a/scripts/run_multinode_nccl_all_collectives.sh
+++ b/scripts/run_multinode_nccl_all_collectives.sh
@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
+# collectives used by the single-node H100 acceptance flow.
+
+# Resolve paths relative to this script so it can be launched from any cwd.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+# Environment-overridable defaults; CLI flags may replace them later.
+: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
+: "${CONFIG_FILE:=$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
+: "${OUT_DIR:=$PROJECT_DIR/reports}"
+: "${FORMAT:=md}"
+# Mode switches, toggled only by CLI flags.
+DRY_RUN=0 RUN_PREFLIGHT=1 PREFLIGHT_ONLY=0
+
+usage() {  # Print the CLI help text to stdout; error paths call it as `usage >&2`.
+  cat <<'EOF'
+Usage: run_multinode_nccl_all_collectives.sh [options]
+
+Options:
+  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
+  --out-dir PATH      Report output directory (default: reports)
+  --format FORMAT     Report format: md, json, or html (default: md)
+  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
+  --preflight-only    Run only the preflight check, not the workload
+  --dry-run           Print commands without running them
+  -h, --help          Show this help
+EOF
+}
+
+while (($#)); do
+  case "$1" in
+    --python|--config|--out-dir|--format)
+      # Value-taking flags: fail with usage (exit 2) instead of tripping `set -u`
+      # on an unbound $2 when the value is missing.
+      if (($# < 2)); then
+        echo "Missing value for $1" >&2
+        usage >&2
+        exit 2
+      fi
+      case "$1" in
+        --python)  PYTHON_BIN="$2" ;;
+        --config)  CONFIG_FILE="$2" ;;
+        --out-dir) OUT_DIR="$2" ;;
+        --format)  FORMAT="$2" ;;
+      esac
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+RUN_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
+echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
+echo "Topology: 2 nodes x 8 GPUs per node; 16G"
+
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN workload:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
+  printf ' %q' "${RUN_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+# Real runs only: --dry-run must not touch the filesystem; stop if the directory cannot be created.
+mkdir -p "$OUT_DIR" || exit 1
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}" || {
+    preflight_status=$?
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  }
+fi
+((PREFLIGHT_ONLY)) && exit 0
+
+mkdir -p "$ARTIFACT_DIR" || exit 1
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
+status=$?
+
+if [[ -e "$REPORT_FILE" ]]; then  # announce the report only if the run actually wrote it
+  echo "Report written to: $REPORT_FILE"
+else
+  echo "Report not produced: $REPORT_FILE" >&2
+fi
+echo "Artifacts written to: $ARTIFACT_DIR"
+exit "$status"