#!/usr/bin/env bash
#
# Collect a lightweight NCCL/RDMA environment snapshot on one node.
# This script does not run NCCL workloads and is safe to use before deeper tests.
#
# Usage:
#   <script> [OUT_FILE]
#
# Environment (all optional):
#   OUT_FILE             Report path; the positional argument takes precedence.
#                        Default: /tmp/nccl_environment_snapshot_<host>_<timestamp>.md
#   PDF_ALLREDUCE_BUSBW  Reference allreduce bus bandwidth in GB/s (default 491.84).
#   PDF_ALLTOALL_BUSBW   Reference alltoall bus bandwidth in GB/s (default 76.54).
#   PLUGIN_SEARCH_ROOTS  Whitespace-separated directories searched for NCCL net
#                        plugin / SHARP libraries (default "/usr /opt /tmp /root").
#
# Output:
#   Writes a Markdown report to OUT_FILE (truncating any existing file) and
#   prints the report path on stdout.
set -euo pipefail

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"

mkdir -p -- "$(dirname -- "$OUT_FILE")"

# Unmatched globs expand to nothing, so a host without mlx5 devices (or an HCA
# without ports) produces empty loops instead of literal patterns.
shopt -s nullglob

#######################################
# Test whether a command is available.
# Arguments: $1 - command name
# Returns:   0 if `command -v` resolves it, non-zero otherwise
#######################################
have_cmd() {
  command -v "$1" >/dev/null 2>&1
}

#######################################
# Append a titled, fenced section containing a command's output to the report.
# The command's stderr is captured too, and its failure is ignored so that a
# single broken tool does not abort the snapshot.
# Globals:   OUT_FILE (appended to)
# Arguments: $1 - section title; remaining args - command (or function) to run
#######################################
emit_cmd() {
  local title="$1"
  shift
  {
    echo
    echo "### $title"
    echo
    echo '```text'
    "$@" 2>&1 || true
    echo '```'
  } >>"$OUT_FILE"
}

#######################################
# Print the content of a sysfs attribute, or "NA" when it cannot be read.
# Arguments: $1 - attribute path
#######################################
read_sysfs_attr() {
  cat "$1" 2>/dev/null || echo NA
}

#######################################
# Print, sorted, every NCCL net plugin / SHARP library found below the given
# directories. Prints nothing when no directory is given or nothing matches.
# Errors from find (missing or unreadable directories) are ignored.
# Arguments: directories to search
#######################################
list_plugin_files() {
  # Without a start path GNU find would search the current directory.
  (($#)) || return 0
  find "$@" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
    2>/dev/null | sort || true
}

#######################################
# Print the installed Debian packages related to NCCL / RDMA stacks,
# limited to the first 160 matching lines.
#######################################
list_relevant_packages() {
  dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'
}

active_400g_hcas=()
non_400g_rows=()

# Report header; this is the only write that truncates OUT_FILE.
{
  echo "# NCCL/RDMA 环境快照"
  echo
  echo "- Host: \`$HOST\`"
  echo "- Time: \`$(date -Is 2>/dev/null || date)\`"
  echo "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`"
  echo
  echo "## HCA / Port 状态"
  echo
  echo "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
  echo "|---|---:|---|---|---:|---|---|"
} >"$OUT_FILE"

# One table row per port of every mlx5 device. A port counts as a 400G IB rail
# only when it is ACTIVE, reports a 400 Gb/sec rate and runs InfiniBand.
hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]})); then
  for hca_path in "${hca_paths[@]}"; do
    hca="$(basename "$hca_path")"
    for port_path in "$hca_path"/ports/*; do
      [[ -d "$port_path" ]] || continue
      port="$(basename "$port_path")"
      state="$(read_sysfs_attr "$port_path/state")"
      phys_state="$(read_sysfs_attr "$port_path/phys_state")"
      rate="$(read_sysfs_attr "$port_path/rate")"
      layer="$(read_sysfs_attr "$port_path/link_layer")"
      is_400g="NO"
      if [[ "$state" == *"ACTIVE"* && "$rate" == 400\ Gb/sec* && "$layer" == "InfiniBand" ]]; then
        is_400g="YES"
        active_400g_hcas+=("$hca")
      else
        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
      fi
      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
    done
  done
else
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' \
    "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
fi

{
  echo
  echo "## Rail 摘要"
  echo
  if ((${#active_400g_hcas[@]})); then
    hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
    echo "- Active 400G IB rail count: \`${#active_400g_hcas[@]}\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`"
    # 400 Gb/s per rail / 8 bits per byte = 50 GB/s per rail.
    echo "- Theoretical one-way raw bandwidth: \`${#active_400g_hcas[@]} * 400Gb/s / 8 = $((${#active_400g_hcas[@]} * 50)) GB/s\`"
  else
    echo "- Active 400G IB rail count: \`0\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
  fi
  echo
  echo "Non-400G / non-IB / down ports:"
  echo
  if ((${#non_400g_rows[@]})); then
    for row in "${non_400g_rows[@]}"; do
      echo "- \`$row\`"
    done
  else
    echo "- none"
  fi
  echo
  echo "## PDF 目标换算"
  echo
  echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`"
  echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"

# Convert the allreduce busbw target into an implied algbw and compare it with
# the raw bandwidth of the detected rails. Best effort: a failing interpreter
# (e.g. a non-numeric PDF_ALLREDUCE_BUSBW) must not abort the snapshot.
if have_cmd python3; then
  python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
import sys

busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
# 1.875 == 2 * (16 - 1) / 16, the allreduce busbw/algbw ratio for 16 ranks.
algbw = busbw / 1.875
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    if algbw > raw:
        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
    else:
        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
    print("- Interpretation: no active 400G IB rail was detected.")
PY
else
  # Make the omission visible in the report instead of dropping it silently.
  echo "- python3 not found; implied algbw conversion skipped." >>"$OUT_FILE"
fi

# Decide "found / none found" from the search result itself rather than by
# grepping the whole report, whose earlier sections could contain matching text.
read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on older Bash.
plugin_hits="$(list_plugin_files ${plugin_roots[@]+"${plugin_roots[@]}"})"
{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
  if [[ -n "$plugin_hits" ]]; then
    printf '%s\n' "$plugin_hits"
  else
    echo "none found under $PLUGIN_SEARCH_ROOTS"
  fi
  echo '```'
} >>"$OUT_FILE"

if have_cmd dpkg; then
  emit_cmd "Relevant Debian packages" list_relevant_packages
else
  emit_cmd "Relevant packages" echo 'dpkg not found'
fi

if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
fi

if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi

{
  echo
  echo "## 建议判断"
  echo
  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。"
  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。"
  echo "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
  echo
  echo "Snapshot written to: \`$OUT_FILE\`"
} >>"$OUT_FILE"

echo "$OUT_FILE"