Add NCCL environment snapshot script

This commit is contained in:
cs 2026-05-23 19:13:35 +08:00
parent ec6b868d3f
commit b6b1ccc2dc
3 changed files with 192 additions and 0 deletions

View File

@ -105,6 +105,13 @@ cd /root/test_gpu_scripts
bash scripts/multinode_nccl_deep_diagnose.sh preflight
```
### 单节点环境等价性快照
```bash
cd /root/test_gpu_scripts
bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
```
### 完整深度诊断
```bash
@ -139,6 +146,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
## 当前建议

View File

@ -26,6 +26,7 @@
| 文件 | 用途 |
|---|---|
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
推荐先跑轻量检查:
@ -35,6 +36,13 @@ cd /root/test_gpu_scripts
bash scripts/multinode_nccl_deep_diagnose.sh preflight
```
采集单节点环境快照:
```bash
cd /root/test_gpu_scripts
bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
```
完整复跑:
```bash
@ -75,6 +83,13 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
- `graph/`
- `pxn_sweep/`
最新单节点环境快照:
```text
aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0012_20260523_111142.md
aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
```
## 当前证据摘要
### HCA / rail

View File

@ -0,0 +1,169 @@
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Lightweight NCCL/RDMA environment snapshot for a single node.
# No NCCL workload is started, so it is safe to run ahead of deeper tests.
#
# Usage: nccl_environment_snapshot.sh [OUT_FILE]
# Environment overrides (each falls back to its default when unset or empty):
#   OUT_FILE, PDF_ALLREDUCE_BUSBW, PDF_ALLTOALL_BUSBW, PLUGIN_SEARCH_ROOTS

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Report path precedence: first positional argument, then $OUT_FILE, then /tmp.
if [[ -n "${1:-}" ]]; then
  OUT_FILE="$1"
else
  : "${OUT_FILE:=/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}"
fi

: "${PDF_ALLREDUCE_BUSBW:=491.84}"
: "${PDF_ALLTOALL_BUSBW:=76.54}"
: "${PLUGIN_SEARCH_ROOTS:=/usr /opt /tmp /root}"

# Make sure the directory that will hold the report exists.
mkdir -p "$(dirname "$OUT_FILE")"

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob
# Returns 0 when "$1" resolves to something runnable (per `command -v`),
# non-zero otherwise. Prints nothing.
have_cmd() {
  local tool=$1
  command -v "$tool" &>/dev/null
}
# Appends one fenced section to $OUT_FILE.
# Arguments: $1 - section heading; remaining arguments - command to run.
# The command's stdout and stderr both land inside the ```text fence, and a
# failing command is tolerated so the snapshot keeps going.
emit_cmd() {
  local heading=$1
  shift
  {
    printf '\n### %s\n\n' "$heading"
    printf '%s\n' '```text'
    if ! "$@" 2>&1; then
      : # best effort: the captured output already documents the failure
    fi
    printf '%s\n' '```'
  } >>"$OUT_FILE"
}
# Accumulators for the port scan: HCA names that qualify as 400G IB rails,
# and a one-line description of every port that does not.
declare -a active_400g_hcas=()
declare -a non_400g_rows=()

# Start the report (truncating any previous file): title, host facts, and
# the header rows of the HCA/port table.
printf '%s\n' \
  "# NCCL/RDMA 环境快照" \
  "" \
  "- Host: \`${HOST}\`" \
  "- Time: \`$(date -Is 2>/dev/null || date)\`" \
  "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`" \
  "" \
  "## HCA / Port 状态" \
  "" \
  "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |" \
  "|---|---:|---|---|---:|---|---|" \
  >"$OUT_FILE"
# Prints the content of one sysfs attribute file, or "NA" if it cannot be read.
sysfs_attr() {
  cat "$1" 2>/dev/null || echo NA
}

# Walk every port of every mlx5_* device under /sys/class/infiniband and add
# one table row per port. A port counts as a 400G IB rail when its state
# contains ACTIVE, its rate starts with "400 Gb/sec" and its link layer is
# InfiniBand; every other port is remembered in non_400g_rows.
hca_paths=(/sys/class/infiniband/mlx5_*)
{
  if [[ ${#hca_paths[@]} -eq 0 ]]; then
    printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found"
  else
    for dev_dir in "${hca_paths[@]}"; do
      dev="${dev_dir##*/}"
      for port_dir in "$dev_dir"/ports/*; do
        if [[ ! -d "$port_dir" ]]; then
          continue
        fi
        port_num="${port_dir##*/}"
        link_state="$(sysfs_attr "$port_dir/state")"
        phys="$(sysfs_attr "$port_dir/phys_state")"
        speed="$(sysfs_attr "$port_dir/rate")"
        link_layer="$(sysfs_attr "$port_dir/link_layer")"

        if [[ "$link_state" == *ACTIVE* && "$speed" == "400 Gb/sec"* && "$link_layer" == InfiniBand ]]; then
          rail_flag="YES"
          active_400g_hcas+=("$dev")
        else
          rail_flag="NO"
          non_400g_rows+=("$dev port=$port_num state=$link_state rate=$speed layer=$link_layer")
        fi

        printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
          "$dev" "$port_num" "$link_state" "$phys" "$speed" "$link_layer" "$rail_flag"
      done
    done
  fi
} >>"$OUT_FILE"
# Summarise the scan: rail count, a comma-joined NCCL_IB_HCA candidate, the
# raw one-way bandwidth (50 GB/s per 400Gb/s rail), the ports that did not
# qualify, and the configured PDF busbw targets.
rail_total=${#active_400g_hcas[@]}
{
  printf '\n## Rail 摘要\n\n'
  if ((rail_total > 0)); then
    # Join the HCA names with commas, then drop the trailing separator.
    printf -v hca_csv '%s,' "${active_400g_hcas[@]}"
    hca_csv=${hca_csv%,}
    printf '%s\n' \
      "- Active 400G IB rail count: \`${rail_total}\`" \
      "- Candidate \`NCCL_IB_HCA\`: \`${hca_csv}\`" \
      "- Theoretical one-way raw bandwidth: \`${rail_total} * 400Gb/s / 8 = $((rail_total * 50)) GB/s\`"
  else
    printf '%s\n' \
      "- Active 400G IB rail count: \`0\`" \
      "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
  fi

  printf '\n%s\n\n' "Non-400G / non-IB / down ports:"
  if [[ ${#non_400g_rows[@]} -gt 0 ]]; then
    printf -- '- `%s`\n' "${non_400g_rows[@]}"
  else
    printf '%s\n' "- none"
  fi

  printf '\n## PDF 目标换算\n\n'
  printf '%s\n' \
    "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`" \
    "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"
# Appends the allreduce busbw -> algbw conversion to the report.
# Globals: PDF_ALLREDUCE_BUSBW (read), active_400g_hcas (read),
#          OUT_FILE (appended to).
# The conversion stays best effort, but it is no longer silent: when python3
# is missing or PDF_ALLREDUCE_BUSBW does not parse as a number, an explicit
# N/A line is written so the report shows that the section was skipped.
emit_allreduce_conversion() {
  if ! python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY'
import sys

busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
# 1.875 == 2 * (n - 1) / n for n = 16 ranks (allreduce busbw/algbw factor).
algbw = busbw / 1.875
# Each 400 Gb/s rail carries 50 GB/s of raw one-way bandwidth.
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    if algbw > raw:
        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
    else:
        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
    print("- Interpretation: no active 400G IB rail was detected.")
PY
  then
    printf '%s\n' \
      "- 16-rank allreduce implied algbw: \`N/A\` (python3 missing or PDF_ALLREDUCE_BUSBW=\`${PDF_ALLREDUCE_BUSBW}\` is not a number)" \
      >>"$OUT_FILE"
  fi
}
emit_allreduce_conversion
# Appends the "NCCL Net Plugin / SHARP" section to the report.
# Globals: PLUGIN_SEARCH_ROOTS (read, whitespace-separated directories),
#          plugin_roots (written), OUT_FILE (appended to).
# The "none found" decision is based on find's own output. Grepping the
# whole report for the library names could be fooled by any other text in
# the file that happens to mention them.
emit_plugin_files() {
  local found=""
  read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
  # With no roots, find would fall back to the current directory; skip it.
  if ((${#plugin_roots[@]})); then
    # find exits non-zero for missing or unreadable roots; under pipefail that
    # must not abort the snapshot, hence the trailing `|| true`.
    found="$(
      find "${plugin_roots[@]}" \
        \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) 2>/dev/null \
        | sort
    )" || true
  fi
  {
    echo
    echo "## NCCL Net Plugin / SHARP 文件"
    echo
    echo '```text'
    if [[ -n "$found" ]]; then
      printf '%s\n' "$found"
    else
      echo "none found under $PLUGIN_SEARCH_ROOTS"
    fi
    echo '```'
  } >>"$OUT_FILE"
}
emit_plugin_files
# Package inventory (first 160 matching dpkg rows).
# `grep -E` replaces the obsolescent `egrep`, whose deprecation warning on
# stderr would otherwise be captured into the report. A plain `bash -c` is
# used because a login shell can add profile output to the section.
if have_cmd dpkg; then
  emit_cmd "Relevant Debian packages" \
    bash -c "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
else
  emit_cmd "Relevant packages" echo 'dpkg not found'
fi

# GPU/NIC topology matrix.
if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
fi

# ibstat details. A missing tool is reported explicitly, like the sections
# above, so snapshots from different nodes stay comparable.
if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
else
  emit_cmd "ibstat" echo 'ibstat not found'
fi
# Closing guidance section of the report, followed by the report's own path.
# Item 2 now has the full-width comma that was missing between the library
# patterns and "NCCL", so the emitted sentence no longer runs together.
{
  echo
  echo "## 建议判断"
  echo
  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。"
  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。"
  echo "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
  echo
  echo "Snapshot written to: \`$OUT_FILE\`"
} >>"$OUT_FILE"

# Print the report path on stdout so callers can capture it. printf is used
# so that a path resembling an echo option is still printed verbatim.
printf '%s\n' "$OUT_FILE"