test_gpu_scripts/scripts/nccl_environment_snapshot.sh

170 lines
5.1 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
#
# Collect a lightweight NCCL/RDMA environment snapshot on one node and write it
# as a Markdown report. No NCCL workload is started, so it is safe to run
# before deeper tests.
#
# Usage: nccl_environment_snapshot.sh [OUT_FILE]
#
# Environment (all optional):
#   OUT_FILE             Report path, used when no positional argument is given.
#   PDF_ALLREDUCE_BUSBW  Reference allreduce bus bandwidth in GB/s (default 491.84).
#   PDF_ALLTOALL_BUSBW   Reference alltoall bus bandwidth in GB/s (default 76.54).
#   PLUGIN_SEARCH_ROOTS  Whitespace-separated directories searched for plugin
#                        libraries (default "/usr /opt /tmp /root").
set -euo pipefail

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Precedence for the report path: positional argument, then a non-empty
# OUT_FILE from the environment, then a per-host, per-timestamp file in /tmp.
if [[ -n "${1:-}" ]]; then
  OUT_FILE="$1"
elif [[ -z "${OUT_FILE:-}" ]]; then
  OUT_FILE="/tmp/nccl_environment_snapshot_${HOST}_${TS}.md"
fi

# Fill in defaults only where the caller left the variable unset or empty.
: "${PDF_ALLREDUCE_BUSBW:=491.84}"
: "${PDF_ALLTOALL_BUSBW:=76.54}"
: "${PLUGIN_SEARCH_ROOTS:=/usr /opt /tmp /root}"

mkdir -p "$(dirname "$OUT_FILE")"

# Unmatched globs expand to nothing, so "no HCA found" shows up as an empty array.
shopt -s nullglob
# Reports whether a command name resolves in the current shell.
# Arguments: $1 - command name to look up
# Outputs:   nothing (stdout and stderr of the lookup are discarded)
# Returns:   0 when the name resolves, 1 otherwise
have_cmd() {
  local tool="$1"
  if command -v "$tool" &>/dev/null; then
    return 0
  fi
  return 1
}
# Appends one report section: a level-3 heading followed by the output of a
# command inside a fenced text block.
# Globals:   OUT_FILE (read) - report file that is appended to
# Arguments: $1 - section heading; remaining arguments - command to run
# Outputs:   nothing on stdout; the command's stdout and stderr go to the report
# Returns:   status of the append; a failing command is tolerated
emit_cmd() {
  local heading="$1"
  shift
  {
    printf '\n### %s\n\n' "$heading"
    printf '%s\n' '```text'
    # Best effort: a failing command must not abort the snapshot.
    "$@" 2>&1 || true
    printf '%s\n' '```'
  } >>"$OUT_FILE"
}
# Accumulators filled by the HCA port scan: HCAs with an active 400G
# InfiniBand port, and a description of every port that does not qualify.
active_400g_hcas=()
non_400g_rows=()

# Start the report. The single '>' truncates any earlier file at this path;
# everything after this point appends.
printf '%s\n' \
  "# NCCL/RDMA 环境快照" \
  "" \
  "- Host: \`$HOST\`" \
  "- Time: \`$(date -Is 2>/dev/null || date)\`" \
  "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`" \
  "" \
  "## HCA / Port 状态" \
  "" \
  "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |" \
  "|---|---:|---|---|---:|---|---|" \
  >"$OUT_FILE"
# Prints the content of one sysfs attribute file, or "NA" when it cannot be read.
# Arguments: $1 - path of the attribute file
read_port_attr() {
  cat "$1" 2>/dev/null || echo NA
}

# Walk every port of every mlx5 device and add one table row per port.
hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]} == 0)); then
  # No mlx5 device matched: emit a single placeholder row instead.
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
else
  for dev_dir in "${hca_paths[@]}"; do
    dev_name="${dev_dir##*/}"
    for port_dir in "$dev_dir"/ports/*; do
      if [[ ! -d "$port_dir" ]]; then
        continue
      fi
      port_id="${port_dir##*/}"
      link_state="$(read_port_attr "$port_dir/state")"
      link_phys="$(read_port_attr "$port_dir/phys_state")"
      link_rate="$(read_port_attr "$port_dir/rate")"
      link_layer="$(read_port_attr "$port_dir/link_layer")"

      # A rail counts only when the port is ACTIVE, reports a 400 Gb/sec rate
      # and runs the InfiniBand link layer; anything else is listed separately.
      if [[ "$link_state" != *ACTIVE* || "$link_rate" != "400 Gb/sec"* || "$link_layer" != "InfiniBand" ]]; then
        rail_flag="NO"
        non_400g_rows+=("$dev_name port=$port_id state=$link_state rate=$link_rate layer=$link_layer")
      else
        rail_flag="YES"
        active_400g_hcas+=("$dev_name")
      fi

      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$dev_name" "$port_id" "$link_state" "$link_phys" "$link_rate" "$link_layer" "$rail_flag" >>"$OUT_FILE"
    done
  done
fi
# Summarise the scan: rail count, a candidate NCCL_IB_HCA list, the ports that
# did not qualify, and the reference bandwidth targets.
rail_total=${#active_400g_hcas[@]}
{
  printf '\n%s\n\n' "## Rail 摘要"
  if ((rail_total == 0)); then
    printf '%s\n' \
      "- Active 400G IB rail count: \`0\`" \
      "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
  else
    # Join the HCA names with commas; IFS is changed only inside the subshell.
    hca_csv="$(IFS=,; printf '%s\n' "${active_400g_hcas[*]}")"
    printf '%s\n' \
      "- Active 400G IB rail count: \`${rail_total}\`" \
      "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`" \
      "- Theoretical one-way raw bandwidth: \`${rail_total} * 400Gb/s / 8 = $((rail_total * 50)) GB/s\`"
  fi

  printf '\n%s\n\n' "Non-400G / non-IB / down ports:"
  if ((${#non_400g_rows[@]} == 0)); then
    printf '%s\n' "- none"
  else
    # printf reuses the format once per array element: one bullet per port.
    printf -- '- `%s`\n' "${non_400g_rows[@]}"
  fi

  printf '\n%s\n\n' "## PDF 目标换算"
  printf '%s\n' \
    "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`" \
    "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"
# Prints Markdown bullets that relate the allreduce busbw target to the raw
# bandwidth of the detected 400G rails.
# Arguments: $1 - allreduce bus bandwidth target in GB/s
#            $2 - number of active 400G rails
# Outputs:   bullet lines on stdout
# Returns:   0 when python3 is missing (a note is printed instead);
#            otherwise the exit status of python3
print_allreduce_interpretation() {
  if ! command -v python3 >/dev/null 2>&1; then
    # Say so in the report rather than silently dropping the section.
    echo "- Implied allreduce algbw not computed: python3 not found"
    return 0
  fi
  python3 - "$1" "$2" <<'PY'
import sys

busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
# Allreduce busbw = algbw * 2 * (n - 1) / n, which is a factor of 1.875 at n = 16.
algbw = busbw / 1.875
# One 400 Gb/s rail carries 400 / 8 = 50 GB/s in one direction.
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    if algbw > raw:
        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
    else:
        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
    print("- Interpretation: no active 400G IB rail was detected.")
PY
}

# Best effort: a malformed PDF_ALLREDUCE_BUSBW must not abort the snapshot.
print_allreduce_interpretation "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" || true
# Prints, sorted, the path of every NCCL net plugin / SHARP shared library
# found below the given directories.
# Arguments: $@ - directories to search
# Outputs:   one path per line on stdout; find's error messages are discarded
# Returns:   0 always (unreadable or missing roots are tolerated)
find_plugin_libs() {
  # With no roots at all, GNU find would scan the current directory instead.
  (($#)) || return 0
  find "$@" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) 2>/dev/null \
    | sort || true
}

read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on older bash.
plugin_hits="$(find_plugin_libs ${plugin_roots[@]+"${plugin_roots[@]}"})"
{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
  # Decide on the search result itself; scanning the whole report could match
  # text written by an earlier section.
  if [[ -n "$plugin_hits" ]]; then
    printf '%s\n' "$plugin_hits"
  else
    echo "none found under $PLUGIN_SEARCH_ROOTS"
  fi
  echo '```'
} >>"$OUT_FILE"
# Prints the installed Debian packages whose dpkg listing mentions the
# NCCL/RDMA stack, capped at the first 160 matching lines.
list_relevant_debs() {
  # grep -E replaces the obsolescent egrep, whose deprecation warning on
  # stderr would otherwise be captured into the report.
  dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'
}

# Each section runs in the current shell; a login shell is not needed and
# could add profile output to the report.
if have_cmd dpkg; then
  emit_cmd "Relevant Debian packages" list_relevant_debs
else
  emit_cmd "Relevant packages" echo 'dpkg not found'
fi

if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
fi

# The ibstat section is simply omitted when the tool is absent.
if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi
# Close the report with the interpretation notes and the report's own path.
printf '%s\n' \
  "" \
  "## 建议判断" \
  "" \
  "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。" \
  "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`NCCL 可能只能走 internal IB plugin。" \
  "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。" \
  "" \
  "Snapshot written to: \`$OUT_FILE\`" \
  >>"$OUT_FILE"

# The only stdout output of the script: the report path, for callers to capture.
printf '%s\n' "$OUT_FILE"