Add NCCL environment snapshot script
This commit is contained in:
parent
ec6b868d3f
commit
b6b1ccc2dc
@ -105,6 +105,13 @@ cd /root/test_gpu_scripts
|
||||
bash scripts/multinode_nccl_deep_diagnose.sh preflight
|
||||
```
|
||||
|
||||
### 单节点环境等价性快照
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
|
||||
```
|
||||
|
||||
### 完整深度诊断
|
||||
|
||||
```bash
|
||||
@ -139,6 +146,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
||||
|
||||
## 当前建议
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@
|
||||
| 文件 | 用途 |
|
||||
|---|---|
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
||||
|
||||
推荐先跑轻量检查:
|
||||
@ -35,6 +36,13 @@ cd /root/test_gpu_scripts
|
||||
bash scripts/multinode_nccl_deep_diagnose.sh preflight
|
||||
```
|
||||
|
||||
采集单节点环境快照:
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
|
||||
```
|
||||
|
||||
完整复跑:
|
||||
|
||||
```bash
|
||||
@ -75,6 +83,13 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
- `graph/`
|
||||
- `pxn_sweep/`
|
||||
|
||||
最新单节点环境快照:
|
||||
|
||||
```text
|
||||
aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0012_20260523_111142.md
|
||||
aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
|
||||
```
|
||||
|
||||
## 当前证据摘要
|
||||
|
||||
### HCA / rail
|
||||
|
||||
169
scripts/nccl_environment_snapshot.sh
Normal file
169
scripts/nccl_environment_snapshot.sh
Normal file
@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env bash
#
# Collect a lightweight NCCL/RDMA environment snapshot on one node.
# This script does not run NCCL workloads and is safe to use before deeper tests.
#
# Usage:
#   nccl_environment_snapshot.sh [OUT_FILE]
#
# Environment overrides (each falls back to the default shown):
#   OUT_FILE             report path, used only when no positional argument
#                        is given (/tmp/nccl_environment_snapshot_<host>_<ts>.md)
#   PDF_ALLREDUCE_BUSBW  reference allreduce busbw in GB/s (491.84)
#   PDF_ALLTOALL_BUSBW   reference alltoall busbw in GB/s (76.54)
#   PLUGIN_SEARCH_ROOTS  whitespace-separated directories searched for
#                        plugin libraries ("/usr /opt /tmp /root")
set -euo pipefail

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Precedence: positional argument, then $OUT_FILE, then a per-host default.
OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"

# '--' keeps a report path that starts with '-' from being parsed as an option.
mkdir -p -- "$(dirname -- "$OUT_FILE")"

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob
|
||||
|
||||
#######################################
# Report whether a command is available to this shell.
# Arguments: $1 - command name to look up
# Returns:   0 if `command -v` resolves the name, non-zero otherwise
#            (including when the name is missing or empty)
#######################################
have_cmd() {
  # ${1:-} keeps an argument-less call from aborting the script under `set -u`.
  [[ -n "${1:-}" ]] || return 1
  command -v "$1" >/dev/null 2>&1
}
|
||||
|
||||
#######################################
# Append a titled, fenced section with a command's combined output to the
# report.
# Globals:   OUT_FILE (read) - report file that is appended to
# Arguments: $1   - section title, rendered as a "###" heading
#            $2.. - command and its arguments, executed as given
# Outputs:   nothing on stdout; stdout and stderr of the command go to OUT_FILE
# Returns:   0 even when the command fails (best-effort collection); a failure
#            is recorded inside the section instead of being dropped silently
#######################################
emit_cmd() {
  local title="$1"
  shift
  local status=0
  {
    printf '\n### %s\n\n' "$title"
    printf '%s\n' '```text'
    "$@" 2>&1 || status=$?
    if ((status != 0)); then
      # Without this note a failing command with no output leaves an
      # unexplained empty block in the report.
      printf '[command exited with status %d]\n' "$status"
    fi
    printf '%s\n' '```'
  } >>"$OUT_FILE"
}
|
||||
|
||||
# Accumulators filled by the HCA/port scan:
#   active_400g_hcas - HCA names with an active 400G InfiniBand port
#   non_400g_rows    - one description per port that did not qualify
non_400g_rows=()
active_400g_hcas=()

# Start the report. '>' truncates any previous file at this path; all later
# sections append to it.
printf '%s\n' \
  "# NCCL/RDMA 环境快照" \
  "" \
  "- Host: \`$HOST\`" \
  "- Time: \`$(date -Is 2>/dev/null || date)\`" \
  "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`" \
  "" \
  "## HCA / Port 状态" \
  "" \
  "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |" \
  "|---|---:|---|---|---:|---|---|" \
  >"$OUT_FILE"
|
||||
|
||||
# Print the content of one sysfs attribute file, or "NA" when it cannot be read.
read_port_attr() {
  cat "$1" 2>/dev/null || echo NA
}

# Walk every mlx5 HCA exposed under sysfs and write one table row per port.
# Ports that are ACTIVE, report a "400 Gb/sec" rate and use the InfiniBand
# link layer are collected in active_400g_hcas; all others go to non_400g_rows.
hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]} == 0)); then
  # The glob matched nothing (empty array thanks to nullglob): emit a
  # placeholder row so the table is never empty.
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
else
  for hca_path in "${hca_paths[@]}"; do
    hca="${hca_path##*/}"
    for port_path in "$hca_path"/ports/*; do
      if [[ ! -d "$port_path" ]]; then
        continue
      fi
      port="${port_path##*/}"
      state="$(read_port_attr "$port_path/state")"
      phys_state="$(read_port_attr "$port_path/phys_state")"
      rate="$(read_port_attr "$port_path/rate")"
      layer="$(read_port_attr "$port_path/link_layer")"

      if [[ "$layer" != "InfiniBand" || "$rate" != "400 Gb/sec"* || "$state" != *"ACTIVE"* ]]; then
        is_400g="NO"
        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
      else
        is_400g="YES"
        active_400g_hcas+=("$hca")
      fi

      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" \
        >>"$OUT_FILE"
    done
  done
fi
|
||||
|
||||
# Append the rail summary, the list of non-qualifying ports and the PDF
# reference targets to the report.
rail_count=${#active_400g_hcas[@]}
{
  printf '\n%s\n\n' "## Rail 摘要"

  if ((rail_count > 0)); then
    # Join the HCA names with commas inside a subshell so IFS stays untouched.
    hca_csv="$(
      IFS=,
      echo "${active_400g_hcas[*]}"
    )"
    printf '%s\n' \
      "- Active 400G IB rail count: \`${rail_count}\`" \
      "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`" \
      "- Theoretical one-way raw bandwidth: \`${rail_count} * 400Gb/s / 8 = $((rail_count * 50)) GB/s\`"
  else
    printf '%s\n' \
      '- Active 400G IB rail count: `0`' \
      '- Candidate `NCCL_IB_HCA`: `N/A`'
  fi

  printf '\n%s\n\n' "Non-400G / non-IB / down ports:"

  if ((${#non_400g_rows[@]} > 0)); then
    # printf repeats the format once per array element.
    printf -- '- `%s`\n' "${non_400g_rows[@]}"
  else
    printf '%s\n' "- none"
  fi

  printf '\n%s\n\n' "## PDF 目标换算"
  printf '%s\n' \
    "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`" \
    "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"
|
||||
|
||||
# Convert the PDF allreduce busbw target into the implied algbw and compare it
# with the raw one-way bandwidth of the detected 400G rails (50 GB/s each).
# The step is best-effort: when python3 is missing or the conversion fails,
# the report gets an explicit N/A line instead of silently losing the section.
if ! python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY'
import sys

busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
# 1.875 = 2 * (16 - 1) / 16: the allreduce busbw/algbw factor for 16 ranks.
algbw = busbw / 1.875
# Each 400 Gb/s rail carries 400 / 8 = 50 GB/s one way.
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    if algbw > raw:
        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
    else:
        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
    print("- Interpretation: no active 400G IB rail was detected.")
PY
then
  echo '- 16-rank allreduce implied algbw: `N/A` (python3 unavailable or conversion failed)' >>"$OUT_FILE"
fi
|
||||
|
||||
# List NCCL net plugin and SHARP shared libraries found under the configured
# search roots, inside a fenced text block.
{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
} >>"$OUT_FILE"

# PLUGIN_SEARCH_ROOTS is split on whitespace into separate find start points.
read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"

plugin_hits=""
if ((${#plugin_roots[@]})); then
  # find commonly exits non-zero here (unreadable or missing roots); under
  # pipefail that would abort the script, so the status is deliberately
  # ignored while the collected paths are kept.
  plugin_hits="$(
    find "${plugin_roots[@]}" \
      \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
      2>/dev/null | sort
  )" || true
fi
# With no usable root the search is skipped entirely: find without a start
# point would scan the current directory instead.

# Decide on the search result itself rather than grepping the whole report,
# so unrelated report text can never suppress the "none found" line.
if [[ -n "$plugin_hits" ]]; then
  printf '%s\n' "$plugin_hits" >>"$OUT_FILE"
else
  echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE"
fi

echo '```' >>"$OUT_FILE"
|
||||
|
||||
# Optional tool sections. Each tool is probed first so a missing binary
# yields a readable note (or no section) instead of a shell error.

if have_cmd dpkg; then
  # grep -E replaces the obsolescent egrep, whose deprecation warning on
  # stderr would otherwise be captured into the report. A plain `bash -c`
  # is enough for the pipeline; a login shell would also source profile
  # scripts. sed caps the listing at 160 lines.
  emit_cmd "Relevant Debian packages" \
    bash -c "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
else
  emit_cmd "Relevant packages" echo 'dpkg not found'
fi

if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
fi

# ibstat has no fallback note: the section is simply omitted when the tool
# is not installed.
if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi
|
||||
|
||||
# Close the report with the interpretation guidance and its own location.
printf '%s\n' \
  '' \
  '## 建议判断' \
  '' \
  '1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。' \
  '2. 如果没有 `libnccl-net*.so*` / `libsharp*.so*`,NCCL 可能只能走 internal IB plugin。' \
  '3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。' \
  '' \
  "Snapshot written to: \`$OUT_FILE\`" \
  >>"$OUT_FILE"

# Print the report path on stdout so callers can capture it.
printf '%s\n' "$OUT_FILE"
|
||||
Loading…
x
Reference in New Issue
Block a user