#!/usr/bin/env bash
set -euo pipefail

# Collect a lightweight NCCL/RDMA environment snapshot on one node.
# This script does not run NCCL workloads and is safe to use before deeper tests.
#
# Usage: <this script> [OUT_FILE]
#
# Optional environment overrides:
#   OUT_FILE             report path, used when no positional argument is given
#   PDF_ALLREDUCE_BUSBW  reference allreduce bus bandwidth (reported in GB/s)
#   PDF_ALLTOALL_BUSBW   reference alltoall bus bandwidth (reported in GB/s)
#   PLUGIN_SEARCH_ROOTS  space-separated directories searched for plugin libraries

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Precedence for the report path: positional argument, then $OUT_FILE, then a
# host- and timestamp-stamped file under /tmp.
OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"

mkdir -p -- "$(dirname -- "$OUT_FILE")"

# With nullglob, a glob that matches nothing expands to zero words, so an
# array built from a sysfs glob is empty when no device is present.
shopt -s nullglob

#######################################
# Test whether a command name resolves in the current shell.
# Arguments: $1 - command name to look up
# Outputs:   nothing (stdout and stderr of the lookup are discarded)
# Returns:   the exit status of `command -v "$1"`
#######################################
have_cmd() {
  command -v "$1" >/dev/null 2>&1
}

#######################################
# Append one titled, fenced command-output section to the report.
# Globals:   OUT_FILE (read) - file the section is appended to
# Arguments: $1 - section title, written as a "### " heading
#            $2.. - command and its arguments, executed as given
# Outputs:   nothing on stdout; the command's stdout and stderr both go
#            into the fenced block in OUT_FILE
#######################################
emit_cmd() {
  local title="$1"
  shift
  {
    echo
    echo "### $title"
    echo
    echo '```text'
    # Best effort by design: a failing command must neither abort the script
    # under `set -e` nor leave the code fence unclosed.
    "$@" 2>&1 || true
    echo '```'
  } >>"$OUT_FILE"
}

# Accumulators for the per-port scan of /sys/class/infiniband:
#   active_400g_hcas - HCA name for every port classified as a 400G IB rail
#   non_400g_rows    - one descriptive line for every other port
active_400g_hcas=()
non_400g_rows=()

# Start the report. The single '>' truncates any existing file at this path;
# every later section appends with '>>'.
{
  echo "# NCCL/RDMA 环境快照"
  echo
  echo "- Host: \`$HOST\`"
  echo "- Time: \`$(date -Is 2>/dev/null || date)\`"
  echo "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`"
  echo
  echo "## HCA / Port 状态"
  echo
  echo "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
  echo "|---|---:|---|---|---:|---|---|"
} >"$OUT_FILE"

# Walk every mlx5 HCA under sysfs and append one Markdown table row per port.
# The empty-array check relies on nullglob being enabled, so that an
# unmatched glob yields no elements.
hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]})); then
  for hca_path in "${hca_paths[@]}"; do
    hca="${hca_path##*/}"
    for port_path in "$hca_path"/ports/*; do
      [[ -d "$port_path" ]] || continue
      port="${port_path##*/}"

      # Any attribute that cannot be read is reported as NA.
      state="$(cat "$port_path/state" 2>/dev/null || echo NA)"
      phys_state="$(cat "$port_path/phys_state" 2>/dev/null || echo NA)"
      rate="$(cat "$port_path/rate" 2>/dev/null || echo NA)"
      layer="$(cat "$port_path/link_layer" 2>/dev/null || echo NA)"

      # A port counts as a 400G IB rail only when all three hold: the state
      # text contains ACTIVE, the rate text starts with "400 Gb/sec", and the
      # link layer is exactly InfiniBand.
      is_400g="NO"
      if [[ "$state" == *"ACTIVE"* && "$rate" == 400\ Gb/sec* && "$layer" == "InfiniBand" ]]; then
        is_400g="YES"
        active_400g_hcas+=("$hca")
      else
        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
      fi

      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
    done
  done
else
  # No mlx5 device directory matched: emit a single placeholder row.
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
fi

# Summarise the scan: rail count, a comma-joined NCCL_IB_HCA candidate list,
# the ports that did not qualify, and the reference bandwidth targets.
{
  echo
  echo "## Rail 摘要"
  echo
  if ((${#active_400g_hcas[@]})); then
    # IFS is changed inside the command substitution only, so the comma join
    # does not leak into the rest of the script.
    hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
    echo "- Active 400G IB rail count: \`${#active_400g_hcas[@]}\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`"
    # 400 Gb/s divided by 8 bits per byte is 50 GB/s per rail.
    echo "- Theoretical one-way raw bandwidth: \`${#active_400g_hcas[@]} * 400Gb/s / 8 = $((${#active_400g_hcas[@]} * 50)) GB/s\`"
  else
    echo "- Active 400G IB rail count: \`0\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
  fi
  echo
  echo "Non-400G / non-IB / down ports:"
  echo
  if ((${#non_400g_rows[@]})); then
    for row in "${non_400g_rows[@]}"; do
      echo "- \`$row\`"
    done
  else
    echo "- none"
  fi
  echo
  echo "## PDF 目标换算"
  echo
  echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`"
  echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"

# Convert the allreduce busbw target into the algorithm bandwidth it implies
# and compare that with the raw bandwidth of the detected rails. The quoted
# heredoc delimiter keeps the shell from expanding anything in the Python
# source; values are passed as argv instead. Best effort: if python3 is
# missing or the values do not parse, the report simply lacks these lines.
python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
import sys

busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
# 1.875 equals 2 * (16 - 1) / 16, matching the 16-rank wording printed below.
algbw = busbw / 1.875
# 50 GB/s per rail (400 Gb/s / 8).
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    if algbw > raw:
        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
    else:
        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
    print("- Interpretation: no active 400G IB rail was detected.")
PY

# List NCCL net plugin and SHARP shared libraries found under the search roots.
# PLUGIN_SEARCH_ROOTS is split on whitespace into one find start point each.
read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"

# Capture the matches so the "none found" decision is made on the search
# result itself, not on whatever text the report already contains. The search
# is skipped for an empty root list, because find without a start point would
# fall back to the current directory.
plugin_hits=""
if ((${#plugin_roots[@]})); then
  # find exits non-zero on unreadable or missing directories; that is
  # expected here, so errors are discarded and the status is ignored.
  plugin_hits="$(
    find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
      2>/dev/null | sort || true
  )"
fi

{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
  if [[ -n "$plugin_hits" ]]; then
    printf '%s\n' "$plugin_hits"
  else
    echo "none found under $PLUGIN_SEARCH_ROOTS"
  fi
  echo '```'
} >>"$OUT_FILE"

# Package inventory. The pipeline needs a shell, so it runs through `bash -c`.
# A non-login shell is used so profile scripts cannot write into the report,
# and `grep -E` replaces `egrep`, whose obsolescence warning on newer GNU grep
# would otherwise be captured into the section via stderr.
if have_cmd dpkg; then
  emit_cmd "Relevant Debian packages" \
    bash -c "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
else
  emit_cmd "Relevant packages" echo 'dpkg not found'
fi

# GPU/NIC topology matrix, or a placeholder section when the tool is absent.
if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
fi

# ibstat output is included only when the tool exists; no placeholder
# section is written otherwise.
if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi

# Closing guidance section, followed by the report's own path.
{
  echo
  echo "## 建议判断"
  echo
  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。"
  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。"
  echo "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
  echo
  echo "Snapshot written to: \`$OUT_FILE\`"
} >>"$OUT_FILE"

# The report path is the only thing printed on stdout, so a caller can
# capture it with command substitution.
echo "$OUT_FILE"