Add NCCL PDF matrix topology report
This commit is contained in:
parent
519de86553
commit
49d358c0ca
88
configs/multinode_nccl_nccl227_pdf_matrix.yaml
Normal file
88
configs/multinode_nccl_nccl227_pdf_matrix.yaml
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
tools:
|
||||||
|
install_dir: /opt/gpu-test-tools
|
||||||
|
|
||||||
|
report:
|
||||||
|
output_dir: ./reports
|
||||||
|
format: md
|
||||||
|
|
||||||
|
multinode_nccl:
|
||||||
|
enabled: true
|
||||||
|
mode: cross-leaf-pdf-matrix-nccl-2.27.7
|
||||||
|
hosts:
|
||||||
|
- name: nccl-gpu-1
|
||||||
|
addr: 172.72.8.12
|
||||||
|
slots: 8
|
||||||
|
- name: nccl-gpu-2
|
||||||
|
addr: 172.72.8.16
|
||||||
|
slots: 8
|
||||||
|
ssh_user: root
|
||||||
|
ssh_preflight: true
|
||||||
|
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
|
||||||
|
mpi_ld_preload: null
|
||||||
|
extra_ld_library_path:
|
||||||
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
|
nccl_tests_dir: null
|
||||||
|
tests:
|
||||||
|
- all_reduce_perf
|
||||||
|
- alltoall_perf
|
||||||
|
topologies:
|
||||||
|
- nodes: 2
|
||||||
|
gpus_per_node: 1
|
||||||
|
label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
|
||||||
|
min_peak_busbw_gbps:
|
||||||
|
allreduce: 48.90
|
||||||
|
alltoall: 27.25
|
||||||
|
- nodes: 2
|
||||||
|
gpus_per_node: 2
|
||||||
|
label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
|
||||||
|
min_peak_busbw_gbps:
|
||||||
|
allreduce: 136.93
|
||||||
|
alltoall: 54.41
|
||||||
|
- nodes: 2
|
||||||
|
gpus_per_node: 4
|
||||||
|
label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
|
||||||
|
cuda_visible_devices: 0,1,4,5
|
||||||
|
op_env:
|
||||||
|
alltoall:
|
||||||
|
NCCL_IB_QPS_PER_CONNECTION: 4
|
||||||
|
NCCL_MIN_NCHANNELS: 4
|
||||||
|
NCCL_IB_SPLIT_DATA_ON_QPS: 1
|
||||||
|
min_peak_busbw_gbps:
|
||||||
|
allreduce: 335.48
|
||||||
|
alltoall: 73.73
|
||||||
|
- nodes: 2
|
||||||
|
gpus_per_node: 8
|
||||||
|
label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
|
||||||
|
min_peak_busbw_gbps:
|
||||||
|
allreduce: 491.84
|
||||||
|
alltoall: 76.54
|
||||||
|
begin_size: 16G
|
||||||
|
end_size: 16G
|
||||||
|
step_factor: 2
|
||||||
|
warmup_iters: 10
|
||||||
|
gpus_per_rank: 1
|
||||||
|
timeout_sec: 1800
|
||||||
|
debug: INFO
|
||||||
|
socket_ifname: bond0
|
||||||
|
oob_tcp_ifname: bond0
|
||||||
|
plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
|
||||||
|
ib_gid_index: 3
|
||||||
|
ib_sl: 5
|
||||||
|
ib_tc: 136
|
||||||
|
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
|
||||||
|
ib_timeout: 22
|
||||||
|
qps_per_connection: null
|
||||||
|
min_nchannels: null
|
||||||
|
net_plugin: none
|
||||||
|
nvls_enable: 1
|
||||||
|
split_data_on_qps: null
|
||||||
|
extra_env:
|
||||||
|
NCCL_DEBUG_SUBSYS: INIT,NET
|
||||||
|
NCCL_NET_GDR_LEVEL: 5
|
||||||
|
NCCL_NET_GDR_READ: 1
|
||||||
|
NCCL_DMABUF_ENABLE: 0
|
||||||
|
min_peak_busbw_gbps:
|
||||||
|
allreduce: 0
|
||||||
|
alltoall: 0
|
||||||
@ -95,10 +95,14 @@ class MultiNodeNCCLTest:
|
|||||||
"nodes": nodes,
|
"nodes": nodes,
|
||||||
"gpus_per_node": gpus_per_node,
|
"gpus_per_node": gpus_per_node,
|
||||||
"label": topo.get("label") or f"{nodes} nodes x {gpus_per_node} GPUs",
|
"label": topo.get("label") or f"{nodes} nodes x {gpus_per_node} GPUs",
|
||||||
|
"cuda_visible_devices": topo.get("cuda_visible_devices"),
|
||||||
|
"env": topo.get("env") or {},
|
||||||
|
"op_env": topo.get("op_env") or topo.get("test_env") or {},
|
||||||
|
"min_peak_busbw_gbps": topo.get("min_peak_busbw_gbps"),
|
||||||
})
|
})
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
def _env_exports(self) -> list[tuple[str, str]]:
|
def _env_exports(self, topo: dict = None, label: str = None, binary: str = None) -> list[tuple[str, str]]:
|
||||||
env_cfg = {
|
env_cfg = {
|
||||||
"NCCL_DEBUG": self.cfg.get("debug", "WARN"),
|
"NCCL_DEBUG": self.cfg.get("debug", "WARN"),
|
||||||
"NCCL_SOCKET_IFNAME": self.cfg.get("socket_ifname"),
|
"NCCL_SOCKET_IFNAME": self.cfg.get("socket_ifname"),
|
||||||
@ -124,11 +128,29 @@ class MultiNodeNCCLTest:
|
|||||||
)
|
)
|
||||||
extra_env = self.cfg.get("extra_env") or {}
|
extra_env = self.cfg.get("extra_env") or {}
|
||||||
if isinstance(extra_env, dict):
|
if isinstance(extra_env, dict):
|
||||||
for key, value in extra_env.items():
|
self._merge_env(env_cfg, extra_env)
|
||||||
if value is not None:
|
if topo:
|
||||||
env_cfg[str(key)] = str(value)
|
if topo.get("cuda_visible_devices"):
|
||||||
|
env_cfg["CUDA_VISIBLE_DEVICES"] = str(topo["cuda_visible_devices"])
|
||||||
|
if isinstance(topo.get("env"), dict):
|
||||||
|
self._merge_env(env_cfg, topo["env"])
|
||||||
|
op_env = topo.get("op_env")
|
||||||
|
if isinstance(op_env, dict):
|
||||||
|
for key in (label, binary):
|
||||||
|
overrides = op_env.get(key)
|
||||||
|
if isinstance(overrides, dict):
|
||||||
|
self._merge_env(env_cfg, overrides)
|
||||||
return [(k, str(v)) for k, v in env_cfg.items() if v is not None]
|
return [(k, str(v)) for k, v in env_cfg.items() if v is not None]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _merge_env(env_cfg: dict, overrides: dict):
|
||||||
|
for key, value in overrides.items():
|
||||||
|
key = str(key)
|
||||||
|
if value is None:
|
||||||
|
env_cfg.pop(key, None)
|
||||||
|
else:
|
||||||
|
env_cfg[key] = str(value)
|
||||||
|
|
||||||
def _mpi_ld_preload(self) -> str:
|
def _mpi_ld_preload(self) -> str:
|
||||||
preload = self.cfg.get("mpi_ld_preload")
|
preload = self.cfg.get("mpi_ld_preload")
|
||||||
if isinstance(preload, list):
|
if isinstance(preload, list):
|
||||||
@ -253,7 +275,7 @@ class MultiNodeNCCLTest:
|
|||||||
plm_rsh_args = self.cfg.get("plm_rsh_args")
|
plm_rsh_args = self.cfg.get("plm_rsh_args")
|
||||||
if plm_rsh_args:
|
if plm_rsh_args:
|
||||||
cmd.extend(["--mca", "plm_rsh_args", str(plm_rsh_args)])
|
cmd.extend(["--mca", "plm_rsh_args", str(plm_rsh_args)])
|
||||||
for key, value in self._env_exports():
|
for key, value in self._env_exports(topo=topo, label=label, binary=os.path.basename(binary)):
|
||||||
cmd.extend(["-x", f"{key}={value}"])
|
cmd.extend(["-x", f"{key}={value}"])
|
||||||
|
|
||||||
cmd.extend([
|
cmd.extend([
|
||||||
@ -286,7 +308,7 @@ class MultiNodeNCCLTest:
|
|||||||
|
|
||||||
parsed = self._parse_nccl_output(r.stdout)
|
parsed = self._parse_nccl_output(r.stdout)
|
||||||
net_diag = self._parse_network_diagnostics(r.stdout + "\n" + r.stderr)
|
net_diag = self._parse_network_diagnostics(r.stdout + "\n" + r.stderr)
|
||||||
threshold = self._threshold_for(label)
|
threshold = self._threshold_for(label, topo)
|
||||||
wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
|
wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
|
||||||
has_bw = parsed["peak_busbw_gbps"] > 0
|
has_bw = parsed["peak_busbw_gbps"] > 0
|
||||||
status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
|
status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
|
||||||
@ -296,6 +318,7 @@ class MultiNodeNCCLTest:
|
|||||||
"gpus_per_node": gpus_per_node,
|
"gpus_per_node": gpus_per_node,
|
||||||
"ranks": ranks,
|
"ranks": ranks,
|
||||||
"hosts": selected_hosts,
|
"hosts": selected_hosts,
|
||||||
|
"cuda_visible_devices": topo.get("cuda_visible_devices"),
|
||||||
"command": " ".join(cmd),
|
"command": " ".join(cmd),
|
||||||
"returncode": r.returncode,
|
"returncode": r.returncode,
|
||||||
"status": status,
|
"status": status,
|
||||||
@ -313,10 +336,31 @@ class MultiNodeNCCLTest:
|
|||||||
"finished_at": datetime.now().isoformat(),
|
"finished_at": datetime.now().isoformat(),
|
||||||
}
|
}
|
||||||
|
|
||||||
def _threshold_for(self, label: str) -> float:
|
def _threshold_for(self, label: str, topo: dict = None) -> float:
|
||||||
|
if topo and topo.get("min_peak_busbw_gbps") is not None:
|
||||||
|
topo_thresholds = topo.get("min_peak_busbw_gbps")
|
||||||
|
if isinstance(topo_thresholds, dict):
|
||||||
|
return float(topo_thresholds.get(label, 0) or 0)
|
||||||
|
return float(topo_thresholds or 0)
|
||||||
|
|
||||||
thresholds = self.cfg.get("min_peak_busbw_gbps") or {}
|
thresholds = self.cfg.get("min_peak_busbw_gbps") or {}
|
||||||
if isinstance(thresholds, dict):
|
if isinstance(thresholds, dict):
|
||||||
return float(thresholds.get(label, 0) or 0)
|
op_threshold = thresholds.get(label, 0)
|
||||||
|
if isinstance(op_threshold, dict):
|
||||||
|
keys = []
|
||||||
|
if topo:
|
||||||
|
keys.extend([
|
||||||
|
topo.get("label"),
|
||||||
|
f"{topo.get('nodes')}x{topo.get('gpus_per_node')}",
|
||||||
|
f"{topo.get('nodes')} nodes x {topo.get('gpus_per_node')} GPUs",
|
||||||
|
str(topo.get("gpus_per_node")),
|
||||||
|
])
|
||||||
|
keys.append("default")
|
||||||
|
for key in keys:
|
||||||
|
if key in op_threshold:
|
||||||
|
return float(op_threshold.get(key) or 0)
|
||||||
|
return 0.0
|
||||||
|
return float(op_threshold or 0)
|
||||||
return float(thresholds or 0)
|
return float(thresholds or 0)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@ -481,13 +481,14 @@ class ReportGenerator:
|
|||||||
lines.append("")
|
lines.append("")
|
||||||
for op, data in (multinode.get("tests") or {}).items():
|
for op, data in (multinode.get("tests") or {}).items():
|
||||||
lines.append(f"### Multi-node NCCL {op}\n")
|
lines.append(f"### Multi-node NCCL {op}\n")
|
||||||
lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
|
lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
|
||||||
lines.append("|----------|-------------|-----------|------------|-----------|--------|")
|
lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
|
||||||
for topo in data.get("topologies", []):
|
for topo in data.get("topologies", []):
|
||||||
threshold = topo.get("min_required_gbps", 0) or 0
|
threshold = topo.get("min_required_gbps", 0) or 0
|
||||||
threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
|
threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
|
||||||
|
cuda_visible = topo.get("cuda_visible_devices") or "-"
|
||||||
lines.append(
|
lines.append(
|
||||||
f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
|
f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
|
||||||
f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
|
f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
|
||||||
f"{threshold_text} | {topo.get('status', '?')} |"
|
f"{threshold_text} | {topo.get('status', '?')} |"
|
||||||
)
|
)
|
||||||
|
|||||||
@ -14,6 +14,8 @@
|
|||||||
|
|
||||||
继续 tuning 后发现,配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后,正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`,alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled,而是 GDR enabled 且 NCCL 自动调参后,仍低于当前配置里的验收阈值。
|
继续 tuning 后发现,配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后,正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`,alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled,而是 GDR enabled 且 NCCL 自动调参后,仍低于当前配置里的验收阈值。
|
||||||
|
|
||||||
|
按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后,发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后,2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间,接近 PDF 的 `335.48 GB/s`;alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`,接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`,与 PDF 的 `491.84/76.54 GB/s` 差距明显。
|
||||||
|
|
||||||
同时,`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝,导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
|
同时,`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝,导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
|
||||||
|
|
||||||
## 已完成的修正
|
## 已完成的修正
|
||||||
@ -29,6 +31,8 @@
|
|||||||
9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`,解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`,用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
|
9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`,解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`,用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
|
||||||
10. 增强报告解析,能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`,并列出 enabled/disabled HCA。
|
10. 增强报告解析,能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`,并列出 enabled/disabled HCA。
|
||||||
11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`,避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
|
11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`,避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
|
||||||
|
12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力,支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
|
||||||
|
13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`,覆盖 2 机 1/2/4/8 GPU per node。
|
||||||
|
|
||||||
## 关键证据
|
## 关键证据
|
||||||
|
|
||||||
@ -224,6 +228,50 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
|
|||||||
|
|
||||||
带宽仍约 `13.4 GB/s`。测试后已经恢复默认 `peerdirect_support=0,persistent_api_support=1`。
|
带宽仍约 `13.4 GB/s`。测试后已经恢复默认 `peerdirect_support=0,persistent_api_support=1`。
|
||||||
|
|
||||||
|
### 7. PDF 矩阵对齐与 GPU-NIC 亲和性
|
||||||
|
|
||||||
|
参考 PDF 的跨 Leaf 命令覆盖 2 机 2/4/8/16 卡矩阵,并使用:
|
||||||
|
|
||||||
|
- `NCCL_IB_GID_INDEX=3`
|
||||||
|
- `NCCL_IB_SL=5`
|
||||||
|
- `NCCL_IB_TC=136`
|
||||||
|
- `NCCL_SOCKET_IFNAME=bond0`
|
||||||
|
- `NCCL_IB_TIMEOUT=22`
|
||||||
|
- `NCCL_NET_PLUGIN=none`
|
||||||
|
- `NCCL_NVLS_ENABLE=1`
|
||||||
|
|
||||||
|
本环境与 PDF 参考机器有一个关键硬件差异:当前两台机器只有 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 是 400Gb/s NDR;`mlx5_4,mlx5_5` 是 100Gb/s HDR;`mlx5_2,mlx5_8` 是 25Gb/s;`mlx5_3,mlx5_9` 为 DOWN。参考 PDF 的命令列出了更多 HCA,但当前节点只有 4 条可用的 400G rail,无法等价地当作 8 条 400G rail 使用。
|
||||||
|
|
||||||
|
`nvidia-smi topo -m` 显示:
|
||||||
|
|
||||||
|
| GPU | 最近的 400G HCA |
|
||||||
|
|-----|-----------------|
|
||||||
|
| GPU0 | `mlx5_0` |
|
||||||
|
| GPU1 | `mlx5_1` |
|
||||||
|
| GPU4 | `mlx5_6` |
|
||||||
|
| GPU5 | `mlx5_7` |
|
||||||
|
|
||||||
|
默认 2 机 4 卡会选择 GPU0/1/2/3,其中 GPU2 最近的是 25G/down 端口,GPU3 没有直接对应 400G rail。因此 2 机 4 卡默认 allreduce 只有约 `168 GB/s`。显式设置 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后:
|
||||||
|
|
||||||
|
| 场景 | allreduce | alltoall | 说明 |
|
||||||
|
|------|-----------|----------|------|
|
||||||
|
| 默认 GPU0/1/2/3 | `167.89 GB/s` | `39.68 GB/s` | GPU/NIC 亲和性错误 |
|
||||||
|
| `CUDA_VISIBLE_DEVICES=0,1,4,5` + auto NCCL | `335.34 GB/s` | `63.90 GB/s` | allreduce 接近 PDF |
|
||||||
|
| `CUDA_VISIBLE_DEVICES=0,1,4,5` + PDF 固定参数 | `225.29 GB/s` | `73.10 GB/s` | alltoall 接近 PDF,但 allreduce 被压低 |
|
||||||
|
|
||||||
|
因此当前脚本支持按 op 配环境变量:4 卡 allreduce 用 auto,4 卡 alltoall 用 PDF 固定参数。
|
||||||
|
|
||||||
|
矩阵式正式报告:`reports_multinode_nccl_pdf_matrix_nccl227.md`
|
||||||
|
|
||||||
|
| Topology | allreduce | PDF Reference | Status | alltoall | PDF Reference | Status |
|
||||||
|
|----------|-----------|---------------|--------|----------|---------------|--------|
|
||||||
|
| 2 nodes x 1 GPU | `47.23 GB/s` | `48.90 GB/s` | FAIL | `24.84 GB/s` | `27.25 GB/s` | FAIL |
|
||||||
|
| 2 nodes x 2 GPUs | `136.97 GB/s` | `136.93 GB/s` | PASS | `47.67 GB/s` | `54.41 GB/s` | FAIL |
|
||||||
|
| 2 nodes x 4 GPUs | `333.22 GB/s` | `335.48 GB/s` | FAIL | `72.93 GB/s` | `73.73 GB/s` | FAIL |
|
||||||
|
| 2 nodes x 8 GPUs | `354.02 GB/s` | `491.84 GB/s` | FAIL | `30.04 GB/s` | `76.54 GB/s` | FAIL |
|
||||||
|
|
||||||
|
解释:2 机 4 卡档位已经基本定位并修复到接近 PDF;2 机 8 卡档位不是简单 GPU 顺序问题。尝试调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序、加入 100G/25G active HCA、以及套 PDF 固定参数都没有改善;固定参数反而会把 8 卡 allreduce 从约 `354 GB/s` 压到约 `239 GB/s`。
|
||||||
|
|
||||||
## 当前阻塞
|
## 当前阻塞
|
||||||
|
|
||||||
### 阻塞 1:当前生产 NCCL 版本过旧,GDR 被禁用
|
### 阻塞 1:当前生产 NCCL 版本过旧,GDR 被禁用
|
||||||
@ -236,15 +284,18 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
|
|||||||
|
|
||||||
判断:底层 RDMA 能力存在,GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
|
判断:底层 RDMA 能力存在,GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
|
||||||
|
|
||||||
### 阻塞 2:GDR enabled 且 NCCL 自动调参后带宽仍低于当前阈值
|
### 阻塞 2:2 机 8 GPU 档位仍低于 PDF 参考值
|
||||||
|
|
||||||
现象:
|
现象:
|
||||||
|
|
||||||
- 2x8 16G allreduce:`354.60 GB/s`,阈值 `>= 480 GB/s`
|
- 2x8 16G allreduce:`354.02 GB/s`,PDF 参考 `491.84 GB/s`
|
||||||
- 2x8 16G alltoall:`30.01 GB/s`,阈值 `>= 75 GB/s`
|
- 2x8 16G alltoall:`30.04 GB/s`,PDF 参考 `76.54 GB/s`
|
||||||
- 已使用 4 个 400Gb/s HCA:`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
|
- 已使用 4 个 400Gb/s HCA:`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
|
||||||
|
- 加入 `mlx5_4,mlx5_5` 100G HCA 或 `mlx5_2,mlx5_8` 25G HCA 基本无收益
|
||||||
|
- 调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序基本无收益
|
||||||
|
- 套 PDF 固定参数会让 8 卡 allreduce 明显变差
|
||||||
|
|
||||||
判断:需要确认当前 PDF/config 阈值是否适用于跨 Leaf 两节点场景;如果阈值确实要求跨 Leaf 也达到这些数值,则还需要继续查链路聚合、多 rail 使用、交换网络、NCCL net plugin/SHARP 或 rail mapping。
|
判断:2 机 8 GPU 档位的剩余差距更像硬件 rail 数量/交换网络/路由/拥塞/NCCL net plugin 能力问题,不再是旧 NCCL GDR disabled 或 4 卡 GPU 选择问题。
|
||||||
|
|
||||||
### 阻塞 3:`nccl-gpu-2` SSH 存在外部连接压力
|
### 阻塞 3:`nccl-gpu-2` SSH 存在外部连接压力
|
||||||
|
|
||||||
@ -261,10 +312,12 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
|
|||||||
|
|
||||||
1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力,或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
|
1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力,或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
|
||||||
2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本,不要依赖 pip NCCL 2.21.5;当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
|
2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本,不要依赖 pip NCCL 2.21.5;当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
|
||||||
3. multi-node NCCL 默认不要固定 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`;当前脚本配置已改成 `null`,让 NCCL 自动选择。
|
3. 4 卡 per node 测试应显式使用 `CUDA_VISIBLE_DEVICES=0,1,4,5`,避免默认 GPU0/1/2/3 落到错误 GPU/NIC 亲和性。
|
||||||
4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP;当前日志显示 `Could not find: libnccl-net.so`,NCCL 使用的是 internal IB plugin。
|
4. 4 卡 allreduce 建议继续让 NCCL 自动选择 channel/QP;4 卡 alltoall 如果要贴近 PDF,可单独套 `NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_MIN_NCHANNELS=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`。
|
||||||
5. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数,确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
|
5. 8 卡 per node 不建议套上述固定参数,会降低 allreduce;继续用 auto。
|
||||||
6. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景;如果是,继续按链路和 NCCL rail 聚合方向排查。
|
6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP;当前日志显示 `Could not find: libnccl-net.so`,NCCL 使用的是 internal IB plugin。
|
||||||
|
7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数,确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
|
||||||
|
8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到;如果要求一致,需要网络/硬件侧继续介入。
|
||||||
|
|
||||||
## 当前可交付物
|
## 当前可交付物
|
||||||
|
|
||||||
@ -273,9 +326,11 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
|
|||||||
- `configs/multinode_nccl_nccl227_sweep.yaml`:NCCL 2.27.7 1M 到 4G sweep 配置
|
- `configs/multinode_nccl_nccl227_sweep.yaml`:NCCL 2.27.7 1M 到 4G sweep 配置
|
||||||
- `configs/multinode_nccl_nccl227_16g.yaml`:NCCL 2.27.7 16G 大包配置
|
- `configs/multinode_nccl_nccl227_16g.yaml`:NCCL 2.27.7 16G 大包配置
|
||||||
- `configs/multinode_nccl_nccl227_auto_16g.yaml`:NCCL 2.27.7 16G 自动 channel/QP 配置
|
- `configs/multinode_nccl_nccl227_auto_16g.yaml`:NCCL 2.27.7 16G 自动 channel/QP 配置
|
||||||
|
- `configs/multinode_nccl_nccl227_pdf_matrix.yaml`:按 PDF 矩阵和 GPU 亲和性优化后的跨 Leaf 配置
|
||||||
- `reports_multinode_nccl_diagnostic_2x8_sshfix.md`:脚本生成的原始 2x8 诊断报告
|
- `reports_multinode_nccl_diagnostic_2x8_sshfix.md`:脚本生成的原始 2x8 诊断报告
|
||||||
- `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`:NCCL 2.27.7 256M 诊断报告
|
- `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`:NCCL 2.27.7 256M 诊断报告
|
||||||
- `reports_multinode_nccl_sweep_2x8_nccl227.md`:NCCL 2.27.7 1M 到 4G sweep 报告
|
- `reports_multinode_nccl_sweep_2x8_nccl227.md`:NCCL 2.27.7 1M 到 4G sweep 报告
|
||||||
- `reports_multinode_nccl_16g_2x8_nccl227.md`:NCCL 2.27.7 16G 大包报告
|
- `reports_multinode_nccl_16g_2x8_nccl227.md`:NCCL 2.27.7 16G 大包报告
|
||||||
- `reports_multinode_nccl_16g_2x8_nccl227_auto.md`:NCCL 2.27.7 16G 自动 channel/QP 原始报告
|
- `reports_multinode_nccl_16g_2x8_nccl227_auto.md`:NCCL 2.27.7 16G 自动 channel/QP 原始报告
|
||||||
|
- `reports_multinode_nccl_pdf_matrix_nccl227.md`:NCCL 2.27.7 PDF 矩阵式原始报告
|
||||||
- `reports_multinode_nccl_diagnosis_20260523.md`:本中文诊断总结
|
- `reports_multinode_nccl_diagnosis_20260523.md`:本中文诊断总结
|
||||||
|
|||||||
83
reports_multinode_nccl_pdf_matrix_nccl227.md
Normal file
83
reports_multinode_nccl_pdf_matrix_nccl227.md
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
# GPU Test Report
|
||||||
|
|
||||||
|
- **Date:** 2026-05-23T08:32:58.113416
|
||||||
|
- **Host:** aikubeworker0012
|
||||||
|
|
||||||
|
## Overall Acceptance Verdict
|
||||||
|
|
||||||
|
**Result: FAIL**
|
||||||
|
|
||||||
|
Missing required evidence:
|
||||||
|
- GPU Info
|
||||||
|
- Health Check
|
||||||
|
- Memory Bandwidth
|
||||||
|
- Compute Throughput
|
||||||
|
- NVLink/NVSwitch
|
||||||
|
- NCCL
|
||||||
|
- Stress Test
|
||||||
|
- RDMA
|
||||||
|
- DCGM
|
||||||
|
- Training
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
| Test | Result |
|
||||||
|
|------|--------|
|
||||||
|
| Multi-node NCCL | FAIL |
|
||||||
|
|
||||||
|
## Multi-node NCCL / Cross Leaf
|
||||||
|
|
||||||
|
Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
|
||||||
|
|
||||||
|
- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
|
||||||
|
- **Preflight:** PASS
|
||||||
|
|
||||||
|
### Multi-node NCCL allreduce
|
||||||
|
|
||||||
|
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||||
|
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.23 GB/s | 16G | 47.24 GB/s | >= 48.90 GB/s | FAIL |
|
||||||
|
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.97 GB/s | 16G | 137.17 GB/s | >= 136.93 GB/s | PASS |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 333.22 GB/s | 16G | 333.24 GB/s | >= 335.48 GB/s | FAIL |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.02 GB/s | 16G | 353.92 GB/s | >= 491.84 GB/s | FAIL |
|
||||||
|
|
||||||
|
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||||
|
|----------|--------------|-----------------|------------------|-------------------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
|
||||||
|
| Topology | Return Code | Error / Output Tail |
|
||||||
|
|----------|-------------|---------------------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | E aikubeworker0012:2157248:2157325 [0] NCCL INFO comm 0x5595f28bf420 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 47.2399 # # Collective test concluded: all_reduce_perf # |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ker0012:2157429:2157526 [3] NCCL INFO comm 0x55a8a0147090 rank 3 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE aikubeworker0012:2157427:2157524 [1] NCCL INFO comm 0x55b1b0f86630 rank 1 nranks 8 cudaDev 1 busId 2a000 - Destroy COMPLETE |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | aikubeworker0016:1138578:1139592 [0] NCCL INFO comm 0x556eff26c190 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 353.915 # # Collective test concluded: all_reduce_perf # |
|
||||||
|
|
||||||
|
### Multi-node NCCL alltoall
|
||||||
|
|
||||||
|
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||||
|
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.84 GB/s | 16G | 24.89 GB/s | >= 27.25 GB/s | FAIL |
|
||||||
|
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.67 GB/s | 16G | 47.91 GB/s | >= 54.41 GB/s | FAIL |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.93 GB/s | 16G | 72.97 GB/s | >= 73.73 GB/s | FAIL |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 30.04 GB/s | 16G | 30.04 GB/s | >= 76.54 GB/s | FAIL |
|
||||||
|
|
||||||
|
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||||
|
|----------|--------------|-----------------|------------------|-------------------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
|
||||||
|
|
||||||
|
| Topology | Return Code | Error / Output Tail |
|
||||||
|
|----------|-------------|---------------------|
|
||||||
|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ETE aikubeworker0012:2157727:2157802 [0] NCCL INFO comm 0x55a0349b02b0 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 24.8897 # # Collective test concluded: alltoall_perf # |
|
||||||
|
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ETE aikubeworker0016:1141290:1142410 [0] NCCL INFO comm 0x55fabbea6410 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 47.9094 # # Collective test concluded: alltoall_perf # |
|
||||||
|
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ETE aikubeworker0012:2158071:2158172 [0] NCCL INFO comm 0x563312baa7f0 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth : 72.9657 # # Collective test concluded: alltoall_perf # |
|
||||||
|
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | 016:1143717:1145948 [7] NCCL INFO comm 0x5558cc9de640 rank 15 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1143713:1145946 [3] NCCL INFO comm 0x55c1af080e60 rank 11 nranks 16 cudaDev 3 busId 5d000 - Destroy COMPLETE |
|
||||||
|
|
||||||
|
**Overall: FAIL**
|
||||||
|
|
||||||
|
---
|
||||||
|
*Generated by GPU Test Suite v0.2.0*
|
||||||
Loading…
x
Reference in New Issue
Block a user