Add multinode NCCL all collectives run
This commit is contained in:
parent
59621d26a0
commit
f5699bf85a
72
configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
Normal file
72
configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
Normal file
@ -0,0 +1,72 @@
|
||||
tools:
|
||||
install_dir: /opt/gpu-test-tools
|
||||
|
||||
report:
|
||||
output_dir: ./reports
|
||||
format: md
|
||||
|
||||
multinode_nccl:
|
||||
enabled: true
|
||||
mode: cross-leaf-all-collectives-nccl-2.27.7
|
||||
hosts:
|
||||
- name: nccl-gpu-1
|
||||
addr: 172.72.8.12
|
||||
slots: 8
|
||||
- name: nccl-gpu-2
|
||||
addr: 172.72.8.16
|
||||
slots: 8
|
||||
ssh_user: root
|
||||
ssh_preflight: true
|
||||
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
|
||||
mpi_ld_preload: null
|
||||
extra_ld_library_path:
|
||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||
tests:
|
||||
- all_reduce_perf
|
||||
- alltoall_perf
|
||||
- broadcast_perf
|
||||
- reduce_scatter_perf
|
||||
- all_gather_perf
|
||||
- sendrecv_perf
|
||||
topologies:
|
||||
- nodes: 2
|
||||
gpus_per_node: 8
|
||||
label: 2 nodes x 8 GPUs (all collectives evidence run)
|
||||
op_env:
|
||||
alltoall:
|
||||
NCCL_PXN_DISABLE: 1
|
||||
begin_size: 16G
|
||||
end_size: 16G
|
||||
step_factor: 2
|
||||
warmup_iters: 10
|
||||
gpus_per_rank: 1
|
||||
timeout_sec: 1800
|
||||
debug: INFO
|
||||
socket_ifname: bond0
|
||||
oob_tcp_ifname: bond0
|
||||
plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
|
||||
ib_gid_index: 3
|
||||
ib_sl: 5
|
||||
ib_tc: 136
|
||||
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
|
||||
ib_timeout: 22
|
||||
qps_per_connection: null
|
||||
min_nchannels: null
|
||||
net_plugin: none
|
||||
nvls_enable: 1
|
||||
split_data_on_qps: null
|
||||
extra_env:
|
||||
NCCL_DEBUG_SUBSYS: INIT,NET
|
||||
NCCL_NET_GDR_LEVEL: 5
|
||||
NCCL_NET_GDR_READ: 1
|
||||
NCCL_DMABUF_ENABLE: 0
|
||||
min_peak_busbw_gbps:
|
||||
allreduce: 491.84
|
||||
alltoall: 76.54
|
||||
broadcast: 0
|
||||
reducescatter: 0
|
||||
allgather: 0
|
||||
sendrecv: 0
|
||||
@ -18,14 +18,29 @@ _TEST_ALIASES = {
|
||||
"allreduce": "all_reduce_perf",
|
||||
"all_reduce": "all_reduce_perf",
|
||||
"all_reduce_perf": "all_reduce_perf",
|
||||
"allgather": "all_gather_perf",
|
||||
"all_gather": "all_gather_perf",
|
||||
"all_gather_perf": "all_gather_perf",
|
||||
"alltoall": "alltoall_perf",
|
||||
"all_to_all": "alltoall_perf",
|
||||
"alltoall_perf": "alltoall_perf",
|
||||
"broadcast": "broadcast_perf",
|
||||
"broadcast_perf": "broadcast_perf",
|
||||
"reducescatter": "reduce_scatter_perf",
|
||||
"reduce_scatter": "reduce_scatter_perf",
|
||||
"reduce_scatter_perf": "reduce_scatter_perf",
|
||||
"sendrecv": "sendrecv_perf",
|
||||
"send_recv": "sendrecv_perf",
|
||||
"sendrecv_perf": "sendrecv_perf",
|
||||
}
|
||||
|
||||
_OP_LABELS = {
|
||||
"all_reduce_perf": "allreduce",
|
||||
"all_gather_perf": "allgather",
|
||||
"alltoall_perf": "alltoall",
|
||||
"broadcast_perf": "broadcast",
|
||||
"reduce_scatter_perf": "reducescatter",
|
||||
"sendrecv_perf": "sendrecv",
|
||||
}
|
||||
|
||||
|
||||
|
||||
98
reports_multinode_nccl_all_collectives_20260523_120144.md
Normal file
98
reports_multinode_nccl_all_collectives_20260523_120144.md
Normal file
@ -0,0 +1,98 @@
|
||||
# GPU Test Report
|
||||
|
||||
- **Date:** 2026-05-23T12:04:48.257734
|
||||
- **Host:** aikubeworker0012
|
||||
|
||||
## Overall Acceptance Verdict
|
||||
|
||||
**Result: FAIL**
|
||||
|
||||
Failed or unverified items:
|
||||
- Multi-node NCCL: FAIL
|
||||
|
||||
## Summary
|
||||
|
||||
| Test | Result |
|
||||
|------|--------|
|
||||
| Multi-node NCCL | FAIL |
|
||||
|
||||
## Multi-node NCCL / Cross Leaf
|
||||
|
||||
Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7
|
||||
|
||||
- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
|
||||
- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
|
||||
- **Preflight:** PASS
|
||||
|
||||
### Multi-node NCCL allreduce
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
| Topology | Return Code | Error / Output Tail |
|
||||
|----------|-------------|---------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 354.452 # |
|
||||
|
||||
### Multi-node NCCL alltoall
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
| Topology | Return Code | Error / Output Tail |
|
||||
|----------|-------------|---------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE |
|
||||
|
||||
### Multi-node NCCL broadcast
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
### Multi-node NCCL reducescatter
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
### Multi-node NCCL allgather
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
### Multi-node NCCL sendrecv
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||
|
||||
**Overall: FAIL**
|
||||
|
||||
---
|
||||
*Generated by GPU Test Suite v0.2.0*
|
||||
49
reports_multinode_nccl_all_collectives_run_20260523.md
Normal file
49
reports_multinode_nccl_all_collectives_run_20260523.md
Normal file
@ -0,0 +1,49 @@
|
||||
# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23
|
||||
|
||||
## 测试对象
|
||||
|
||||
- 节点:`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)`
|
||||
- 拓扑:`2 nodes x 8 GPUs`
|
||||
- NCCL:`2.27.7`
|
||||
- nccl-tests:`/data/nccl-tests-latest/build`
|
||||
- 配置:`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml`
|
||||
- 入口:`scripts/run_multinode_nccl_all_collectives.sh`
|
||||
- 远端报告:`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md`
|
||||
- 远端 artifacts:`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
|
||||
- 本地报告:`reports_multinode_nccl_all_collectives_20260523_120144.md`
|
||||
|
||||
## 一句话结论
|
||||
|
||||
这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景:`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通,`returncode=0`、`wrong_count=0`,并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值,`allreduce` 和 `alltoall` 仍 FAIL;新增的 4 项目前没有 PDF 跨节点阈值,因此只作为证据采集项,不判生产验收性能。
|
||||
|
||||
## 结果表
|
||||
|
||||
| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
|
||||
|---|---:|---:|---|---|---|
|
||||
| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
|
||||
| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
|
||||
| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
|
||||
| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
|
||||
| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
|
||||
| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
|
||||
|
||||
## 怎么解读
|
||||
|
||||
1. 这次不是替代 PDF matrix,而是补齐多机多卡 collective 覆盖面。
|
||||
2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值,所以报告整体是 `FAIL`。
|
||||
3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑通、正确性校验 `wrong=0`、走 IB/GDRDMA”,还不能证明生产性能达标,因为手头的 PDF matrix 没有给出这 4 项的跨节点阈值。
|
||||
4. 新增 4 项的带宽大致呈现两个层次:
|
||||
- `broadcast/reducescatter/allgather` 在 `191-193 GB/s`,接近当前 4 x 400G rail 的单向原始上限(4 × 400 Gb/s = 1600 Gb/s ≈ 200 GB/s)。
|
||||
- `sendrecv` 只有 `26.98 GB/s`,需要结合 sendrecv 的 traffic pattern 单独解读,不能直接和 allreduce busbw 混比。
|
||||
|
||||
## 校验信息
|
||||
|
||||
```text
|
||||
06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3 reports/multinode_nccl_all_collectives_20260523_120144.md
|
||||
020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
|
||||
```
|
||||
@ -16,6 +16,7 @@
|
||||
| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,所有 case 正确性通过;除 2x2 allreduce 外,性能阈值仍 FAIL |
|
||||
| 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`,每个 case 有完整 `cmd/stdout/stderr/json` |
|
||||
| artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`,确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA,未见 SHARP/CollNet |
|
||||
| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`,2x8 下 6 项均正确性通过,allreduce/alltoall 按 PDF 阈值仍 FAIL |
|
||||
| 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
|
||||
| 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`,未见 SHARP/HCOLL 包 |
|
||||
|
||||
@ -140,6 +141,15 @@ cd /root/test_gpu_scripts
|
||||
bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||
```
|
||||
|
||||
### 多机多卡 2x8 六项 collective 补测
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/run_multinode_nccl_all_collectives.sh
|
||||
```
|
||||
|
||||
说明:这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`;新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项,不强行套 PDF allreduce/alltoall 阈值。
|
||||
|
||||
### 完整深度诊断
|
||||
|
||||
```bash
|
||||
@ -173,6 +183,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
|
||||
| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
|
||||
| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
|
||||
| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
|
||||
| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
|
||||
| `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
|
||||
| `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
|
||||
| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
|
||||
@ -182,7 +194,9 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
|
||||
| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
|
||||
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
|
||||
| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 |
|
||||
|
||||
## 当前建议
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
- 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑,原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`,artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
|
||||
- 已补充 artifacts 信号分析:`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`,都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`,都有 GDRDMA 信号,但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
|
||||
- 已补充并实跑多机多卡 2x8 六项 collective:`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`;已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
|
||||
- 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值,但严格按阈值仍 FAIL。
|
||||
- 2 机 8 GPU 档位仍未达到 PDF 参考值:
|
||||
- allreduce 实测 `353.85 GB/s busbw`,PDF 目标 `491.84 GB/s`。
|
||||
@ -22,8 +23,9 @@
|
||||
| 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划,包含决策树、要问的问题和复跑命令 |
|
||||
| 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价,重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
|
||||
| 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析,确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
|
||||
| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
|
||||
| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep |
|
||||
| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果,补齐单机 test all 的 NCCL 覆盖面 |
|
||||
| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
|
||||
| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep |
|
||||
|
||||
## 关键脚本
|
||||
|
||||
@ -33,7 +35,9 @@
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON |
|
||||
| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口,跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv,并归档每个 case |
|
||||
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
|
||||
| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置,allreduce/alltoall 保留 PDF 阈值,新增 4 项暂按证据采集 |
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
||||
|
||||
多机多卡 PDF 矩阵:
|
||||
@ -43,6 +47,13 @@ cd /root/test_gpu_scripts
|
||||
bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||
```
|
||||
|
||||
多机多卡 2x8 六项 collective 补测:
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/run_multinode_nccl_all_collectives.sh
|
||||
```
|
||||
|
||||
单节点 H100 原始 all 报告:
|
||||
|
||||
```bash
|
||||
@ -88,6 +99,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
/root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
|
||||
/root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
|
||||
/root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
|
||||
/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
|
||||
/root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
|
||||
```
|
||||
|
||||
@ -123,6 +135,15 @@ summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
|
||||
manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
|
||||
```
|
||||
|
||||
最新多机多卡 2x8 六项 collective 补测:
|
||||
|
||||
```text
|
||||
aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
|
||||
artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
|
||||
local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
|
||||
summary: reports_multinode_nccl_all_collectives_run_20260523.md
|
||||
```
|
||||
|
||||
下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成:
|
||||
|
||||
```text
|
||||
@ -131,6 +152,14 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m
|
||||
|
||||
目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。
|
||||
|
||||
下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时,还会生成:
|
||||
|
||||
```text
|
||||
/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/
|
||||
```
|
||||
|
||||
目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据;已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。
|
||||
|
||||
## 当前证据摘要
|
||||
|
||||
### HCA / rail
|
||||
@ -200,6 +229,8 @@ PXN disabled sweep 未发现有效参数:
|
||||
| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
|
||||
| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
|
||||
| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
|
||||
| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
|
||||
| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
|
||||
| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
|
||||
| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
|
||||
| `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
|
||||
|
||||
147
scripts/run_multinode_nccl_all_collectives.sh
Executable file
147
scripts/run_multinode_nccl_all_collectives.sh
Executable file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env bash
# nounset and pipefail are enabled, errexit is not: further down the script
# reads $? after the preflight and workload commands, which only works if a
# failing command does not abort the shell first.
set -uo pipefail

# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
# collectives used by the single-node H100 acceptance flow.

# Directory holding this script, and the project root one level above it.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

# Defaults; each of these four can be overridden from the environment.
PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"

# Mode switches, stored as 0/1 so they can be tested with (( ... )).
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0
|
||||
|
||||
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect to stderr for errors)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Usage: run_multinode_nccl_all_collectives.sh [options]

Options:
--python PATH Python executable (default: /root/gpu-test-venv/bin/python)
--config PATH Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
--out-dir PATH Report output directory (default: reports)
--format FORMAT Report format: md, json, or html (default: md)
--no-preflight Skip scripts/multinode_nccl_deep_diagnose.sh preflight
--preflight-only Run only the preflight check, not the workload
--dry-run Print commands without running them
-h, --help Show this help
EOF
}
|
||||
|
||||
#######################################
# Abort with a usage error when a value-taking option has no value after it.
# Without this check, reading "$2" under `set -u` kills the script with a
# bare "unbound variable" message instead of a usage error.
# Arguments: $1 - option name
#            $2 - number of arguments still unparsed, option included
#######################################
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

#######################################
# Parse command-line options into the script-level settings.
# Globals:   PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT, DRY_RUN,
#            RUN_PREFLIGHT, PREFLIGHT_ONLY (written)
# Arguments: the script's positional parameters
# Exits:     0 after -h/--help; 2 on an unknown option or a missing value
#######################################
parse_args() {
  while (($#)); do
    case "$1" in
      --python)
        require_value "$1" "$#"
        PYTHON_BIN="$2"
        shift 2
        ;;
      --config)
        require_value "$1" "$#"
        CONFIG_FILE="$2"
        shift 2
        ;;
      --out-dir)
        require_value "$1" "$#"
        OUT_DIR="$2"
        shift 2
        ;;
      --format)
        require_value "$1" "$#"
        FORMAT="$2"
        shift 2
        ;;
      --no-preflight)
        RUN_PREFLIGHT=0
        shift
        ;;
      --preflight-only)
        PREFLIGHT_ONLY=1
        shift
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Accept only the report formats listed in the usage text.
case "$FORMAT" in
  md | json | html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# Resolve the Python interpreter. PYTHON_BIN is first tried as a path; if that
# is not executable, a bare command name (no slash) is looked up on PATH, and
# only then does the script fall back to whatever python3 is on PATH.
if [[ ! -x "$PYTHON_BIN" ]]; then
  requested_python="$PYTHON_BIN"
  PYTHON_BIN=""
  if [[ -n "$requested_python" && "$requested_python" != */* ]]; then
    # type -P searches PATH only, ignoring functions and aliases.
    PYTHON_BIN="$(type -P "$requested_python" 2>/dev/null || true)"
  fi
  if [[ -z "$PYTHON_BIN" ]]; then
    PYTHON_BIN="$(command -v python3 || true)"
    if [[ -n "$PYTHON_BIN" ]]; then
      # Make the substitution visible instead of silently swapping interpreters.
      echo "Python executable '$requested_python' not usable; falling back to $PYTHON_BIN" >&2
    fi
  fi
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi
|
||||
|
||||
# One timestamp names both the report file and its artifact directory, so the
# two can be paired up afterwards.
TS="$(date +%Y%m%d_%H%M%S)"

REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"

# Commands are kept as arrays so paths containing spaces survive intact.
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"

# Dry run: print the shell-quoted commands and exit. Nothing is created on
# disk in this mode.
if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN workload:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${RUN_CMD[@]}"
  printf '\n'
  exit 0
fi

# Real run from here on: the output directory is created before the preflight.
if ! mkdir -p "$OUT_DIR"; then
  echo "Cannot create output directory: $OUT_DIR" >&2
  exit 1
fi

if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

if ! mkdir -p "$ARTIFACT_DIR"; then
  echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
  exit 1
fi

# The artifact directory is passed to the workload through its environment only.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

if ((status != 0)); then
  echo "Workload exited with code $status" >&2
fi

echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"
|
||||
Loading…
x
Reference in New Issue
Block a user