Add multinode NCCL all collectives run

This commit is contained in:
cs 2026-05-23 20:07:47 +08:00
parent e0cb796b0c
commit c2db68f608
6 changed files with 413 additions and 2 deletions

View File

@ -0,0 +1,72 @@
# Multi-node NCCL run configuration: two hosts with 8 slots each, six
# nccl-tests collectives, one fixed message size (16G).
tools:
install_dir: /opt/gpu-test-tools
report:
output_dir: ./reports
format: md
multinode_nccl:
enabled: true
# Free-form mode label; the generated report prints it in its "Mode:" line.
mode: cross-leaf-all-collectives-nccl-2.27.7
# Participating nodes. `slots` is presumably the number of MPI ranks (one per
# GPU) launched on each host — confirm against the runner.
hosts:
- name: nccl-gpu-1
addr: 172.72.8.12
slots: 8
- name: nccl-gpu-2
addr: 172.72.8.16
slots: 8
ssh_user: root
ssh_preflight: true
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
mpi_ld_preload: null
# Extra library directories (Open MPI, NCCL 2.27.7 and CUDA 12.4, judging by
# the paths).
# NOTE(review): the NCCL directory lives under /tmp and may disappear on reboot
# or tmp cleanup — confirm this is intentional.
extra_ld_library_path:
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
nccl_tests_dir: /data/nccl-tests-latest/build
# The six nccl-tests binaries to run (presumably resolved under nccl_tests_dir).
tests:
- all_reduce_perf
- alltoall_perf
- broadcast_perf
- reduce_scatter_perf
- all_gather_perf
- sendrecv_perf
topologies:
- nodes: 2
gpus_per_node: 8
label: 2 nodes x 8 GPUs (all collectives evidence run)
# Per-operation environment overrides; only alltoall has one here.
# Presumably these are not applied to the other collectives — TODO confirm.
op_env:
alltoall:
NCCL_PXN_DISABLE: 1
# begin_size equals end_size, so each test measures a single 16G message size.
begin_size: 16G
end_size: 16G
step_factor: 2
warmup_iters: 10
gpus_per_rank: 1
timeout_sec: 1800
debug: INFO
socket_ifname: bond0
oob_tcp_ifname: bond0
plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
# InfiniBand/RoCE tuning values; presumably mapped to the matching NCCL_IB_*
# environment variables by the runner — verify against gpu_tester.py.
ib_gid_index: 3
ib_sl: 5
ib_tc: 136
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
ib_timeout: 22
qps_per_connection: null
min_nchannels: null
net_plugin: none
nvls_enable: 1
split_data_on_qps: null
# Additional environment variables for the run.
extra_env:
NCCL_DEBUG_SUBSYS: INIT,NET
NCCL_NET_GDR_LEVEL: 5
NCCL_NET_GDR_READ: 1
NCCL_DMABUF_ENABLE: 0
# Minimum peak bus bandwidth (GB/s) per collective. Only allreduce and alltoall
# carry a real threshold; for the four entries set to 0 the generated report
# shows "-" as the threshold and marks the run PASS (evidence collection only).
min_peak_busbw_gbps:
allreduce: 491.84
alltoall: 76.54
broadcast: 0
reducescatter: 0
allgather: 0
sendrecv: 0

View File

@ -0,0 +1,98 @@
# GPU Test Report
- **Date:** 2026-05-23T12:04:48.257734
- **Host:** aikubeworker0012
## Overall Acceptance Verdict
**Result: FAIL**
Failed or unverified items:
- Multi-node NCCL: FAIL
## Summary
| Test | Result |
|------|--------|
| Multi-node NCCL | FAIL |
## Multi-node NCCL / Cross Leaf
Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7
- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
- **Preflight:** PASS
### Multi-node NCCL allreduce
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
| Topology | Return Code | Error / Output Tail |
|----------|-------------|---------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 354.452 # |
### Multi-node NCCL alltoall
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
| Topology | Return Code | Error / Output Tail |
|----------|-------------|---------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE |
### Multi-node NCCL broadcast
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
### Multi-node NCCL reducescatter
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
### Multi-node NCCL allgather
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
### Multi-node NCCL sendrecv
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
**Overall: FAIL**
---
*Generated by GPU Test Suite v0.2.0*

View File

@ -0,0 +1,49 @@
# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23
## 测试对象
- 节点:`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)`
- 拓扑:`2 nodes x 8 GPUs`
- NCCL：`2.27.7`
- nccl-tests：`/data/nccl-tests-latest/build`
- 配置:`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml`
- 入口:`scripts/run_multinode_nccl_all_collectives.sh`
- 远端报告:`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md`
- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
- 本地报告:`reports_multinode_nccl_all_collectives_20260523_120144.md`
## 一句话结论
这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景：`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通，`returncode=0`、`wrong_count=0`，并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值，`allreduce`、`alltoall` 仍 FAIL；新增的 4 项目前没有 PDF 跨节点阈值，因此只作为证据采集项，不判生产验收性能。
## 结果表
| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
|---|---:|---:|---|---|---|
| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
## 怎么解读
1. 这次不是替代 PDF matrix，而是补齐多机多卡 collective 覆盖面。
2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值，所以报告整体是 `FAIL`。
3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑、正确性为 0 wrong、走 IB/GDRDMA”，还不能证明生产性能达标，因为手头 PDF matrix 没给这 4 项跨节点阈值。
4. 新增 4 项的带宽大致呈现两个层次:
- `broadcast/reducescatter/allgather` 约 `191-193 GB/s`，接近当前 4 x 400G rail 的单向原始上限（4 × 400 Gb/s ≈ 200 GB/s）。
- `sendrecv` 只有 `26.98 GB/s`,需要结合 sendrecv 的 traffic pattern 单独解读,不能直接和 allreduce busbw 混比。
## 校验信息
```text
06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3 reports/multinode_nccl_all_collectives_20260523_120144.md
020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
```

View File

@ -16,6 +16,7 @@
| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,所有 case 正确性通过;除 2x2 allreduce 外,性能阈值仍 FAIL |
| 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`,每个 case 有完整 `cmd/stdout/stderr/json` |
| artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`，2x8 下 6 项均正确性通过；allreduce/alltoall 按 PDF 阈值仍 FAIL |
| 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
| 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`,未见 SHARP/HCOLL 包 |
@ -140,6 +141,15 @@ cd /root/test_gpu_scripts
bash scripts/run_multinode_nccl_pdf_matrix.sh
```
### 多机多卡 2x8 六项 collective 补测
```bash
cd /root/test_gpu_scripts
bash scripts/run_multinode_nccl_all_collectives.sh
```
说明:这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`;新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项,不强行套 PDF allreduce/alltoall 阈值。
### 完整深度诊断
```bash
@ -173,6 +183,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
| `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
| `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
@ -182,7 +194,9 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 |
## 当前建议

View File

@ -8,6 +8,7 @@
- 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
- 已补充 artifacts 信号分析:`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`,都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`,都有 GDRDMA 信号,但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
- 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增的 `broadcast/reducescatter/allgather/sendrecv` 均为 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
- 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值,但严格按阈值仍 FAIL。
- 2 机 8 GPU 档位仍未达到 PDF 参考值:
- allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`
@ -22,8 +23,9 @@
| 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划,包含决策树、要问的问题和复跑命令 |
| 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价,重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
| 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析,确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep |
| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果,补齐单机 test all 的 NCCL 覆盖面 |
| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep |
## 关键脚本
@ -33,7 +35,9 @@
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall，并归档每个 case 的 command/stdout/stderr/parsed JSON |
| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口，跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv，并归档每个 case |
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置；allreduce/alltoall 保留 PDF 阈值，新增 4 项暂按证据采集 |
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
多机多卡 PDF 矩阵:
@ -43,6 +47,13 @@ cd /root/test_gpu_scripts
bash scripts/run_multinode_nccl_pdf_matrix.sh
```
多机多卡 2x8 六项 collective 补测:
```bash
cd /root/test_gpu_scripts
bash scripts/run_multinode_nccl_all_collectives.sh
```
单节点 H100 原始 all 报告:
```bash
@ -88,6 +99,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
/root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
/root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
/root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
/root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
```
@ -123,6 +135,15 @@ summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
```
最新多机多卡 2x8 六项 collective 补测:
```text
aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
summary: reports_multinode_nccl_all_collectives_run_20260523.md
```
下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成:
```text
@ -131,6 +152,14 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m
目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。
下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时,还会生成:
```text
/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/
```
目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据；已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。
## 当前证据摘要
### HCA / rail
@ -200,6 +229,8 @@ PXN disabled sweep 未发现有效参数:
| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
| `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |

View File

@ -0,0 +1,147 @@
#!/usr/bin/env bash
set -uo pipefail

# Two-node, eight-GPU-per-node NCCL evidence pass covering the six collectives
# that the single-node H100 acceptance flow uses.

# Locate this script's directory and the project root one level above it.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

# Defaults that the caller may pre-set through the environment; each value is
# only assigned here when the variable is unset or empty.
: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
: "${CONFIG_FILE:=$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
: "${OUT_DIR:=$PROJECT_DIR/reports}"
: "${FORMAT:=md}"

# Mode flags (0/1) with their initial values.
DRY_RUN=0 RUN_PREFLIGHT=1 PREFLIGHT_ONLY=0
# Print the command-line help text to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'Usage: run_multinode_nccl_all_collectives.sh [options]' \
    'Options:' \
    '--python PATH Python executable (default: /root/gpu-test-venv/bin/python)' \
    '--config PATH Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)' \
    '--out-dir PATH Report output directory (default: reports)' \
    '--format FORMAT Report format: md, json, or html (default: md)' \
    '--no-preflight Skip scripts/multinode_nccl_deep_diagnose.sh preflight' \
    '--preflight-only Run only the preflight check, not the workload' \
    '--dry-run Print commands without running them' \
    '-h, --help Show this help'
}
# Parse command-line options. Options given here overwrite PYTHON_BIN,
# CONFIG_FILE, OUT_DIR, FORMAT and the DRY_RUN / RUN_PREFLIGHT /
# PREFLIGHT_ONLY flags. Unknown arguments and options missing their value
# print the usage text to stderr and exit with status 2.
while (($#)); do
  case "$1" in
    --python|--config|--out-dir|--format)
      # These options consume the following argument. Check that it exists:
      # with `set -u` active, reading a missing "$2" would abort the script
      # with a bare "unbound variable" error instead of a usage message.
      if (($# < 2)); then
        echo "Option $1 requires a value" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --python)  PYTHON_BIN="$2" ;;
        --config)  CONFIG_FILE="$2" ;;
        --out-dir) OUT_DIR="$2" ;;
        --format)  FORMAT="$2" ;;
      esac
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
# Accept only the three report formats this script knows about.
case "$FORMAT" in
  md|json|html)
    ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# When the selected interpreter is not executable, fall back to whatever
# python3 resolves to on PATH (empty string if there is none).
[[ -x "$PYTHON_BIN" ]] || PYTHON_BIN="$(command -v python3 || true)"

# Give up when neither the selected interpreter nor the fallback is usable.
if [[ -z "$PYTHON_BIN" ]] || [[ ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi
# Derive the timestamped report file and artifact directory names, build the
# preflight and workload command lines, and print a summary of the run.
TS="$(date +%Y%m%d_%H%M%S)"
REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"

# Create the output directory only for real runs: --dry-run is documented as
# printing commands without running them, so it must not touch the filesystem.
# The script does not use `set -e`, so a failed mkdir is checked explicitly.
if ((DRY_RUN == 0)); then
  if ! mkdir -p "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
  fi
fi

# Commands are kept as arrays so arguments containing spaces survive intact.
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
# NOTE(review): the next two lines are fixed text describing the default
# config; they are not derived from $CONFIG_FILE.
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"
# Dry-run mode: print the shell-quoted commands that would be executed, then
# exit 0 without running anything.
if ((DRY_RUN)); then
  # The preflight command is shown only when preflight is enabled.
  ((RUN_PREFLIGHT)) && printf 'DRY RUN preflight:%s\n' "$(printf ' %q' "${PREFLIGHT_CMD[@]}")"

  # With --preflight-only there is no workload command to show.
  ((PREFLIGHT_ONLY)) && exit 0

  # The workload is shown together with the environment variable it is run with.
  printf 'DRY RUN workload: MULTINODE_NCCL_ARTIFACT_DIR=%q%s\n' \
    "$ARTIFACT_DIR" "$(printf ' %q' "${RUN_CMD[@]}")"
  exit 0
fi
# Run the preflight check (unless disabled) and stop with its exit code when
# it fails, so the workload never starts after a failed preflight.
if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

# --preflight-only ends here, before any workload is started.
if ((PREFLIGHT_ONLY)); then
  exit 0
fi

# The script does not use `set -e`, so check the directory creation explicitly
# rather than launching the workload with nowhere to put its artifacts.
if ! mkdir -p "$ARTIFACT_DIR"; then
  echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
  exit 1
fi

# The artifact directory is passed to the tester through the environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

# Only claim the report exists when it actually does. A non-zero status alone
# does not imply a missing report, so the file is checked directly.
if [[ -e "$REPORT_FILE" ]]; then
  echo "Report written to: $REPORT_FILE"
else
  echo "Report was not written: $REPORT_FILE (tester exit code $status)" >&2
fi
echo "Artifacts written to: $ARTIFACT_DIR"

# Propagate the tester's exit code to the caller.
exit "$status"