From cadfbcfaa3ec28f9bd3ec44768859f9a1a0ad2e2 Mon Sep 17 00:00:00 2001
From: cs
Date: Sat, 23 May 2026 19:13:35 +0800
Subject: [PATCH] Add NCCL environment snapshot script

---
 ...ts_multinode_nccl_handoff_plan_20260523.md |   8 +
 ...ts_multinode_nccl_latest_index_20260523.md |  15 ++
 scripts/nccl_environment_snapshot.sh          | 198 ++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 scripts/nccl_environment_snapshot.sh

diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index b13496b..fb4e354 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -105,6 +105,13 @@ cd /root/test_gpu_scripts
 bash scripts/multinode_nccl_deep_diagnose.sh preflight
 ```
 
+### 单节点环境等价性快照
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
+```
+
 ### 完整深度诊断
 
 ```bash
@@ -139,6 +146,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
+| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 94d17b5..4ccbc23 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -26,6 +26,7 @@
 | 文件 | 用途 |
 |---|---|
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
+| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
 推荐先跑轻量检查:
@@ -35,6 +36,13 @@ cd /root/test_gpu_scripts
 bash scripts/multinode_nccl_deep_diagnose.sh preflight
 ```
 
+采集单节点环境快照:
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
+```
+
 完整复跑:
 
 ```bash
@@ -75,6 +83,13 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 - `graph/`
 - `pxn_sweep/`
 
+最新单节点环境快照:
+
+```text
+aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0012_20260523_111142.md
+aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
+```
+
 ## 当前证据摘要
 
 ### HCA / rail
diff --git a/scripts/nccl_environment_snapshot.sh b/scripts/nccl_environment_snapshot.sh
new file mode 100644
index 0000000..77725ff
--- /dev/null
+++ b/scripts/nccl_environment_snapshot.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Collect a lightweight NCCL/RDMA environment snapshot on one node.
+# This script does not run NCCL workloads and is safe to use before deeper tests.
+#
+# Usage: nccl_environment_snapshot.sh [OUT_FILE]
+#
+# Environment (all optional):
+#   OUT_FILE             Report path, used when no positional argument is given.
+#   PDF_ALLREDUCE_BUSBW  Reference allreduce busbw in GB/s (default 491.84).
+#   PDF_ALLTOALL_BUSBW   Reference alltoall busbw in GB/s (default 76.54).
+#   PLUGIN_SEARCH_ROOTS  Space-separated directories searched for plugin libraries.
+#
+# The Markdown report is written to OUT_FILE and its path is printed on stdout.
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
+PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
+PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
+PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"
+
+mkdir -p "$(dirname "$OUT_FILE")"
+# With nullglob, a glob that matches nothing expands to an empty list.
+shopt -s nullglob
+
+# Return 0 when command $1 can be resolved by the shell.
+have_cmd() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+# Append a "### <title>" section holding the combined stdout/stderr of a command.
+#   $1    section title
+#   $2... command and its arguments
+# A failing command is tolerated so the snapshot always runs to completion.
+emit_cmd() {
+  local title="$1"
+  shift
+  {
+    echo
+    echo "### $title"
+    echo
+    echo '```text'
+    "$@" 2>&1 || true
+    echo '```'
+  } >>"$OUT_FILE"
+}
+
+active_400g_hcas=()
+non_400g_rows=()
+
+{
+  echo "# NCCL/RDMA 环境快照"
+  echo
+  echo "- Host: \`$HOST\`"
+  echo "- Time: \`$(date -Is 2>/dev/null || date)\`"
+  echo "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`"
+  echo
+  echo "## HCA / Port 状态"
+  echo
+  echo "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
+  echo "|---|---:|---|---|---:|---|---|"
+} >"$OUT_FILE"
+
+# One table row per port of every mlx5_* device found under sysfs.
+hca_paths=(/sys/class/infiniband/mlx5_*)
+if ((${#hca_paths[@]})); then
+  for hca_path in "${hca_paths[@]}"; do
+    hca="$(basename "$hca_path")"
+    for port_path in "$hca_path"/ports/*; do
+      [[ -d "$port_path" ]] || continue
+      port="$(basename "$port_path")"
+      state="$(cat "$port_path/state" 2>/dev/null || echo NA)"
+      phys_state="$(cat "$port_path/phys_state" 2>/dev/null || echo NA)"
+      rate="$(cat "$port_path/rate" 2>/dev/null || echo NA)"
+      layer="$(cat "$port_path/link_layer" 2>/dev/null || echo NA)"
+      is_400g="NO"
+      if [[ "$state" == *"ACTIVE"* && "$rate" == 400\ Gb/sec* && "$layer" == "InfiniBand" ]]; then
+        is_400g="YES"
+        active_400g_hcas+=("$hca")
+      else
+        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
+      fi
+      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
+        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
+    done
+  done
+else
+  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
+fi
+
+{
+  echo
+  echo "## Rail 摘要"
+  echo
+  if ((${#active_400g_hcas[@]})); then
+    hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
+    echo "- Active 400G IB rail count: \`${#active_400g_hcas[@]}\`"
+    echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`"
+    # 400 Gb/s per rail divided by 8 bits per byte gives 50 GB/s per rail.
+    echo "- Theoretical one-way raw bandwidth: \`${#active_400g_hcas[@]} * 400Gb/s / 8 = $((${#active_400g_hcas[@]} * 50)) GB/s\`"
+  else
+    echo "- Active 400G IB rail count: \`0\`"
+    echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
+  fi
+  echo
+  echo "Non-400G / non-IB / down ports:"
+  echo
+  if ((${#non_400g_rows[@]})); then
+    for row in "${non_400g_rows[@]}"; do
+      echo "- \`$row\`"
+    done
+  else
+    echo "- none"
+  fi
+  echo
+  echo "## PDF 目标换算"
+  echo
+  echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`"
+  echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
+} >>"$OUT_FILE"
+
+# busbw -> algbw for 16 ranks; 1.875 equals 2*(n-1)/n with n=16 (nccl-tests allreduce factor).
+if have_cmd python3; then
+  python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
+import sys
+
+busbw = float(sys.argv[1])
+rail_count = int(sys.argv[2])
+algbw = busbw / 1.875
+raw = rail_count * 50.0
+print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
+if rail_count:
+    pct = algbw / raw * 100
+    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
+    if algbw > raw:
+        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
+    else:
+        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
+else:
+    print("- Interpretation: no active 400G IB rail was detected.")
+PY
+else
+  echo "- python3 not found; skipped allreduce algbw conversion." >>"$OUT_FILE"
+fi
+
+{
+  echo
+  echo "## NCCL Net Plugin / SHARP 文件"
+  echo
+  echo '```text'
+} >>"$OUT_FILE"
+
+# Decide "none found" from find's own output rather than by grepping the whole
+# report, so unrelated report text can never be mistaken for a plugin path.
+# find exits non-zero for a missing or unreadable root; that is tolerated here.
+read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
+plugin_files="$(find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
+  2>/dev/null | sort || true)"
+
+if [[ -n "$plugin_files" ]]; then
+  printf '%s\n' "$plugin_files" >>"$OUT_FILE"
+else
+  echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE"
+fi
+
+echo '```' >>"$OUT_FILE"
+
+if have_cmd dpkg; then
+  # Non-login shell and grep -E: keeps profile output and egrep warnings out of the report.
+  emit_cmd "Relevant Debian packages" bash -c "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
+else
+  emit_cmd "Relevant packages" echo 'dpkg not found'
+fi
+
+if have_cmd nvidia-smi; then
+  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
+else
+  emit_cmd "nvidia-smi topo -m" echo 'nvidia-smi not found'
+fi
+
+if have_cmd ibstat; then
+  emit_cmd "ibstat" ibstat
+fi
+
+{
+  echo
+  echo "## 建议判断"
+  echo
+  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。"
+  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。"
+  echo "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
+  echo
+  echo "Snapshot written to: \`$OUT_FILE\`"
+} >>"$OUT_FILE"
+
+echo "$OUT_FILE"