test_gpu_scripts/scripts/multinode_nccl_deep_diagnose.sh

426 lines
14 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# Deep-diagnose multi-node NCCL behavior from the coordinator node.
# Default values match the current 2-node H100 cross-leaf investigation.
#
# The first positional argument selects the mode (default: all). Every
# setting below may be overridden through an environment variable of the
# same name.
MODE="${1:-all}"

# Launcher and test binaries.
MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"

# Cluster layout: mpirun host list, plus the single peer reached over ssh
# for remote counter snapshots and preflight checks.
HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
PEER_HOST="${PEER_HOST:-172.72.8.16}"
SSH_USER="${SSH_USER:-root}"

# HCAS (space separated) drives the sysfs counter reads; HCA_CSV (comma
# separated) is exported as NCCL_IB_HCA. Unless HCA_CSV is given explicitly
# it is derived from HCAS, so overriding HCAS alone cannot make NCCL use a
# different set of devices than the one whose counters are being sampled.
HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
if [[ -z "${HCA_CSV:-}" ]]; then
  read -ra _hca_words <<<"$HCAS"
  HCA_CSV="$(IFS=,; printf '%s' "${_hca_words[*]-}")"
  unset _hca_words
fi

# All artifacts of one invocation land in a timestamped directory.
OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"

# Message-size range and iteration counts handed to the nccl-tests binaries.
BEGIN_SIZE="${BEGIN_SIZE:-16G}"
END_SIZE="${END_SIZE:-16G}"
WARMUP_ITERS="${WARMUP_ITERS:-10}"
ITERS="${ITERS:-10}"
GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
GRAPH_ITERS="${GRAPH_ITERS:-1}"
SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
SWEEP_ITERS="${SWEEP_ITERS:-5}"

# Library search path exported to the ranks as LD_LIBRARY_PATH.
NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"

# Remember the operator's NCCL_DEBUG so set_common_env can restore it after
# a case that raised the verbosity.
DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

# Counter file names sampled under ports/1/counters and ports/1/hw_counters.
COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"

mkdir -p "$OUT_DIR"
# mpirun command prefix shared by every NCCL test launch; run_nccl appends
# the -x export flags and the test binary with its arguments.
mpi_base=("$MPI_BIN" --allow-run-as-root)
# MCA parameters: silence the openib device-params warning and pin the TCP
# BTL and out-of-band traffic to bond0.
mpi_base+=(--mca btl_openib_warn_no_device_params_found 0)
mpi_base+=(--mca btl_tcp_if_include bond0)
mpi_base+=(--mca oob_tcp_if_include bond0)
# ssh options used when mpirun starts its remote daemons.
mpi_base+=(
  --mca plm_rsh_args
  "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
)
# Placement: 8 ranks per node, 16 ranks in total, on the hosts in $HOSTS.
mpi_base+=(-H "$HOSTS")
mpi_base+=(--map-by ppr:8:node)
mpi_base+=(-np 16)
# Names of the environment variables that mpi_xargs forwards to the ranks
# with "-x NAME" (only those currently set). Order is the order in which
# the flags are emitted.
base_exports=(
  LD_LIBRARY_PATH
  NCCL_IB_GID_INDEX
  NCCL_IB_SL
  NCCL_IB_TC
  NCCL_SOCKET_IFNAME
  NCCL_DEBUG
  NCCL_DEBUG_SUBSYS
  NCCL_IB_TIMEOUT
  NCCL_IB_HCA
  NCCL_NET_PLUGIN
  NCCL_NVLS_ENABLE
  NCCL_NET_GDR_LEVEL
  NCCL_NET_GDR_READ
  NCCL_DMABUF_ENABLE
  NCCL_PXN_DISABLE
  NCCL_IB_QPS_PER_CONNECTION
  NCCL_IB_SPLIT_DATA_ON_QPS
  NCCL_MIN_NCHANNELS
  NCCL_MAX_NCHANNELS
  NCCL_BUFFSIZE
  NCCL_P2P_NET_CHUNKSIZE
  NCCL_NCHANNELS_PER_NET_PEER
  NCCL_IB_AR_THRESHOLD
)
#######################################
# Reset the NCCL-related environment to the common baseline for one case.
#
# The first call records the operator-supplied value (or the built-in
# default) of every tunable that may be overridden from the environment.
# Each call then restores exactly that baseline. Re-reading the live
# variable with ${VAR:-default} on every call would keep whatever a previous
# case exported (for example the sweep entry that sets NCCL_NVLS_ENABLE=0)
# and silently apply it to all later cases.
#
# Globals read:    NCCL_LD_LIBRARY_PATH, DEFAULT_NCCL_DEBUG, HCA_CSV
# Globals written: the exported LD_LIBRARY_PATH and NCCL_* variables below,
#                  plus the private _BASE_NCCL_* baseline copies
# Arguments:       none
#######################################
set_common_env() {
  if [[ -z "${_COMMON_ENV_BASELINE_CAPTURED:-}" ]]; then
    _BASE_NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}"
    _BASE_NCCL_IB_SL="${NCCL_IB_SL:-5}"
    _BASE_NCCL_IB_TC="${NCCL_IB_TC:-136}"
    _BASE_NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond0}"
    _BASE_NCCL_IB_TIMEOUT="${NCCL_IB_TIMEOUT:-22}"
    _BASE_NCCL_NET_PLUGIN="${NCCL_NET_PLUGIN:-none}"
    _BASE_NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-1}"
    _BASE_NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-5}"
    _BASE_NCCL_NET_GDR_READ="${NCCL_NET_GDR_READ:-1}"
    _BASE_NCCL_DMABUF_ENABLE="${NCCL_DMABUF_ENABLE:-0}"
    _COMMON_ENV_BASELINE_CAPTURED=1
  fi

  # Per-case knobs have no baseline value: always start with them unset.
  unset NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
  unset NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
  unset NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
  unset NCCL_IB_AR_THRESHOLD

  export LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
  export NCCL_IB_GID_INDEX="$_BASE_NCCL_IB_GID_INDEX"
  export NCCL_IB_SL="$_BASE_NCCL_IB_SL"
  export NCCL_IB_TC="$_BASE_NCCL_IB_TC"
  export NCCL_SOCKET_IFNAME="$_BASE_NCCL_SOCKET_IFNAME"
  export NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
  export NCCL_IB_TIMEOUT="$_BASE_NCCL_IB_TIMEOUT"
  export NCCL_IB_HCA="$HCA_CSV"
  export NCCL_NET_PLUGIN="$_BASE_NCCL_NET_PLUGIN"
  export NCCL_NVLS_ENABLE="$_BASE_NCCL_NVLS_ENABLE"
  export NCCL_NET_GDR_LEVEL="$_BASE_NCCL_NET_GDR_LEVEL"
  export NCCL_NET_GDR_READ="$_BASE_NCCL_NET_GDR_READ"
  export NCCL_DMABUF_ENABLE="$_BASE_NCCL_DMABUF_ENABLE"
}
#######################################
# Emit the mpirun "-x NAME" flags for every variable listed in base_exports
# that is currently set (an empty value still counts as set).
#
# Globals read: base_exports, and each variable it names
# Outputs:      one token per line on stdout ("-x", then the name), so the
#               caller can load them into an array with mapfile -t
#######################################
mpi_xargs() {
  # Keep the loop variable local so a caller's own "name" is not clobbered.
  local name
  for name in "${base_exports[@]}"; do
    # ${!name+x} expands to "x" only when the variable named by $name is set.
    if [[ -n "${!name+x}" ]]; then
      printf -- '-x\n%s\n' "$name"
    fi
  done
}
#######################################
# Launch one nccl-tests binary through mpirun and print its average bus
# bandwidth line(s), prefixed with the operation label.
#
# Globals read: mpi_base, BEGIN_SIZE, END_SIZE (and base_exports through
#               mpi_xargs)
# Arguments:
#   $1 - operation label used as the prefix of the printed line
#   $2 - path of the nccl-tests binary
#   $3 - log file receiving the combined stdout/stderr of the run
#   $4 - warmup iterations (-w)
#   $5 - measured iterations (-n)
# Outputs: "<op> <Avg bus bandwidth line>" on stdout for each match
# Returns: mpirun's exit status when the launch fails, after reporting the
#          log location on stderr
#######################################
run_nccl() {
  local op="$1"
  local bin="$2"
  local log="$3"
  local warmup="$4"
  local iters="$5"
  local -a export_flags=()
  local rc=0

  mapfile -t export_flags < <(mpi_xargs)

  # The "+" guard keeps an empty flag list from tripping "set -u" on bash
  # releases that treat an empty array expansion as unbound.
  "${mpi_base[@]}" ${export_flags[@]+"${export_flags[@]}"} \
    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters" \
    >"$log" 2>&1 || rc=$?

  if ((rc != 0)); then
    # All output went to the log, so say where to look before propagating.
    printf 'run_nccl: %s failed with exit status %d; see %s\n' "$op" "$rc" "$log" >&2
    return "$rc"
  fi

  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
}
#######################################
# Snapshot the local port counters of every HCA in $HCAS.
#
# For each HCA the readable files named in $COUNTERS (under
# ports/1/counters) are written first, then those named in $HW_COUNTERS
# (under ports/1/hw_counters). Files that are not readable are skipped; a
# readable file whose read fails is recorded with the value 0.
#
# Globals read: HCAS, COUNTERS, HW_COUNTERS (whitespace-separated lists)
# Arguments:
#   $1 - host label written as the first field of every line
#   $2 - output file (truncated first)
#   $3 - optional sysfs root, default /sys/class/infiniband
# Output line format: "<host> <hca> <counters|hw_counters> <name> <value>"
#######################################
read_one_snapshot() {
  local host_label="$1"
  local out="$2"
  local sysfs_root="${3:-/sys/class/infiniband}"
  # Loop variables are local so a snapshot never clobbers caller state.
  local hca kind names counter path value

  # Word splitting of the three lists below is intentional.
  for hca in $HCAS; do
    for kind in counters hw_counters; do
      if [[ "$kind" == counters ]]; then
        names="$COUNTERS"
      else
        names="$HW_COUNTERS"
      fi
      for counter in $names; do
        path="$sysfs_root/$hca/ports/1/$kind/$counter"
        [[ -r "$path" ]] || continue
        value="$(cat "$path" 2>/dev/null || echo 0)"
        printf '%s %s %s %s %s\n' "$host_label" "$hca" "$kind" "$counter" "$value"
      done
    done
  done >"$out"
}
#######################################
# Snapshot the port counters of $PEER_HOST over ssh.
#
# The quoted here-document is fed to "bash -s" on the peer; HCAS, COUNTERS
# and HW_COUNTERS reach it as single-quoted assignments in the remote
# command line. The remote loop prints the same five-field lines as the
# local snapshot, labelled with the peer's $HOSTNAME.
#
# Globals read: SSH_USER, PEER_HOST, HCAS, COUNTERS, HW_COUNTERS
# Arguments:    $1 - output file for the peer's snapshot
#######################################
read_remote_snapshot() {
  local out="$1"
  local target="${SSH_USER}@${PEER_HOST}"
  local remote_cmd="HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s"
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o BatchMode=yes
    -o ConnectTimeout=5
  )

  ssh "${ssh_opts[@]}" "$target" "$remote_cmd" >"$out" <<'EOS'
for hca in $HCAS; do
for c in $COUNTERS; do
f="/sys/class/infiniband/$hca/ports/1/counters/$c"
if [ -r "$f" ]; then
printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
fi
done
for c in $HW_COUNTERS; do
f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
if [ -r "$f" ]; then
printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
fi
done
done
EOS
}
#######################################
# Diff two pairs of counter snapshots and write a report.
#
# Snapshot lines have five whitespace-separated fields:
#   <host> <hca> <counters|hw_counters> <name> <integer value>
# Lines with a different field count or a non-integer value are ignored.
# A key present in only one of before/after is treated as 0 on the other
# side.
#
# Arguments:
#   $1, $2 - "before" snapshots (merged)
#   $3, $4 - "after" snapshots (merged)
#   $5     - report file
# Report sections:
#   NONZERO_DELTAS             every counter whose value changed; the two
#                              *_data counters also get a GiB figure
#                              computed as delta * 4 bytes
#   ERROR_OR_CONGESTION_DELTAS the changed counters that belong to the
#                              "interesting" set, or "none"
#######################################
summarize_counter_delta() {
  local before_a="$1"
  local before_b="$2"
  local after_a="$3"
  local after_b="$4"
  local out="$5"
  # The here-document is quoted, so the Python below is passed verbatim and
  # must keep its own indentation starting at column 0.
  python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
import pathlib
import sys

# Counters whose movement indicates errors, retransmits or congestion.
interesting = {
    "port_xmit_wait", "port_xmit_discards", "port_rcv_errors",
    "port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors",
    "port_xmit_constraint_errors", "port_rcv_constraint_errors",
    "symbol_error", "link_error_recovery", "link_downed",
    "local_link_integrity_errors", "excessive_buffer_overrun_errors",
    "VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to",
    "roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans",
    "packet_seq_err", "out_of_sequence", "out_of_buffer",
    "duplicate_request", "implied_nak_seq_err", "local_ack_timeout_err",
    "req_transport_retries_exceeded", "rnr_nak_retry_err",
}


def load(path):
    """Parse one snapshot file into {(host, hca, kind, counter): value}."""
    data = {}
    for line in pathlib.Path(path).read_text().splitlines():
        parts = line.split()
        if len(parts) != 5:
            continue
        host, hca, kind, counter, value = parts
        try:
            data[(host, hca, kind, counter)] = int(value)
        except ValueError:
            pass
    return data


before = {}
after = {}
before.update(load(sys.argv[1]))
before.update(load(sys.argv[2]))
after.update(load(sys.argv[3]))
after.update(load(sys.argv[4]))

# Compute every non-zero delta once, in sorted key order, for both sections.
changed = []
for key in sorted(set(before) | set(after)):
    delta = after.get(key, 0) - before.get(key, 0)
    if delta:
        changed.append((key, delta))

print("NONZERO_DELTAS")
for key, delta in changed:
    host, hca, kind, counter = key
    if counter in {"port_xmit_data", "port_rcv_data"}:
        gib = delta * 4 / (1024 ** 3)
        print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
    else:
        print(f"{host} {hca} {kind} {counter} {delta}")

print("ERROR_OR_CONGESTION_DELTAS")
seen = False
for key, delta in changed:
    if key[3] in interesting:
        seen = True
        print(*key, delta)
if not seen:
    print("none")
PY
}
#######################################
# Run one NCCL test bracketed by local and remote counter snapshots, then
# write the counter delta report.
#
# Globals read: OUT_DIR, WARMUP_ITERS, ITERS
# Arguments:
#   $1 - case label; artifacts go to $OUT_DIR/<label>_counter
#   $2 - nccl-tests binary
#   $3 - optional space-separated VAR=value assignments exported on top of
#        the common environment
# Outputs: whatever run_nccl prints, then the artifact directory path
#######################################
run_counter_case() {
  local op="$1" bin="$2" extra="${3:-}"
  local dir

  set_common_env
  # $extra only ever comes from this script's own call sites, which is why
  # it is handed to eval.
  [[ -z "$extra" ]] || eval "export $extra"

  dir="$OUT_DIR/${op}_counter"
  mkdir -p "$dir"

  local pre_local="$dir/before.local" pre_remote="$dir/before.remote"
  local post_local="$dir/after.local" post_remote="$dir/after.remote"

  read_one_snapshot "$(hostname)" "$pre_local"
  read_remote_snapshot "$pre_remote"

  run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS"

  read_one_snapshot "$(hostname)" "$post_local"
  read_remote_snapshot "$post_remote"

  summarize_counter_delta \
    "$pre_local" "$pre_remote" "$post_local" "$post_remote" \
    "$dir/counter_delta.txt"
  echo "$dir"
}
#######################################
# Condense an NCCL log into a short text summary.
#
# Arguments:
#   $1 - NCCL log file to scan
#   $2 - summary file to write
# The summary reports, in order: the last "Avg bus bandwidth" value (or
# NA), the distinct NCCL versions seen, counts of the plugin-missing and
# GDR-enabled messages, the most common "NET/IB : Using [...]" strings
# (top 4), counts per graph search pattern, counts per channel summary
# line, P2P chunk sizes, "Check P2P Type" lines, occurrences of selected
# transport tokens, and the number of channel edge lines.
#######################################
summarize_graph_log() {
  local log="$1"
  local out="$2"
  # The here-document is quoted, so the Python below is passed verbatim and
  # must keep its own indentation starting at column 0.
  python3 - "$log" >"$out" <<'PY'
from pathlib import Path
import collections
import re
import sys

# Undecodable bytes in the log are dropped rather than aborting the summary.
text = Path(sys.argv[1]).read_text(errors="ignore")

print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1])
print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text))))
print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))

print("using_hca")
for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4):
    print(f" {count} {value}")

print("pattern_counts")
patterns = re.findall(
    r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
    text,
)
for key, count in collections.Counter(patterns).most_common():
    print(f" {count} {key}")

print("channel_summary")
for value, count in collections.Counter(
    re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text)
).most_common():
    print(f" {count} {value}")

print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text)))
print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text)))

for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]:
    print(token, text.count(token))

print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)]))
PY
}
#######################################
# Run one NCCL test with verbose INIT/NET/GRAPH/TUNING/COLL logging and
# summarize the resulting log.
#
# Globals read: OUT_DIR, GRAPH_WARMUP_ITERS, GRAPH_ITERS
# Arguments:
#   $1 - case label; files are $OUT_DIR/graph/<label>.log and
#        $OUT_DIR/graph/<label>_summary.txt
#   $2 - nccl-tests binary
#   $3 - optional space-separated VAR=value assignments exported on top of
#        the common environment
# Outputs: whatever run_nccl prints, then the summary file path
#######################################
run_graph_case() {
  local op="$1" bin="$2" extra="${3:-}"
  local dir log summary

  set_common_env
  # Raise verbosity after the baseline reset; $extra may still override it.
  export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
  # $extra only ever comes from this script's own call sites, which is why
  # it is handed to eval.
  [[ -z "$extra" ]] || eval "export $extra"

  dir="$OUT_DIR/graph"
  log="$dir/${op}.log"
  summary="$dir/${op}_summary.txt"
  mkdir -p "$dir"

  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
  summarize_graph_log "$log" "$summary"
  echo "$summary"
}
#######################################
# Sweep alltoall_perf with PXN disabled over a table of NCCL tuning
# overrides and collect the results.
#
# Each table entry is "<case name>|<space-separated VAR=value overrides>".
# Every case runs in its own subshell, so the variables it exports cannot
# leak into later cases. Applying them in the calling shell would let
# NCCL_NVLS_ENABLE=0 from "nvls_off" persist into every case after it,
# because the common environment keeps an already-set value.
#
# Globals read: OUT_DIR, NCCL_TESTS_DIR, SWEEP_WARMUP_ITERS, SWEEP_ITERS
# Outputs: per-case header and bandwidth lines on stdout (also appended to
#          $OUT_DIR/pxn_sweep/summary.txt), then the summary file path;
#          one log per case in $OUT_DIR/pxn_sweep/<case>.log
#######################################
run_pxn_sweep() {
  local dir="$OUT_DIR/pxn_sweep"
  local -a cases=(
    "baseline|"
    "nvls_off|NCCL_NVLS_ENABLE=0"
    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
    "buff8m|NCCL_BUFFSIZE=8388608"
    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
    "ar0|NCCL_IB_AR_THRESHOLD=0"
  )
  local item name extra log

  mkdir -p "$dir"
  : >"$dir/summary.txt"

  for item in "${cases[@]}"; do
    name="${item%%|*}"
    extra="${item#*|}"
    log="$dir/${name}.log"
    (
      # Environment changes below are confined to this subshell.
      set_common_env
      export NCCL_PXN_DISABLE=1
      if [[ -n "$extra" ]]; then
        # Safe to eval: $extra comes only from the literal table above.
        eval "export $extra"
      fi
      echo "===== CASE $name ====="
      echo "extra: ${extra:-none}"
      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
      awk '/Avg bus bandwidth/ {print}' "$log" | tail -1
    ) | tee -a "$dir/summary.txt"
  done

  echo "$dir/summary.txt"
}
#######################################
# Check that the launcher, the test binaries and the HCAs are present, on
# this host and (over ssh) on $PEER_HOST.
#
# Globals read: OUT_DIR, MPI_BIN, NCCL_TESTS_DIR, HCAS, SSH_USER, PEER_HOST
# Outputs: the report on stdout, mirrored to $OUT_DIR/preflight.txt, then
#          that file's path
#######################################
run_preflight() {
  set_common_env

  local out="$OUT_DIR/preflight.txt"
  local target="${SSH_USER}@${PEER_HOST}"
  # The peer receives its inputs as single-quoted assignments in front of
  # "bash -s"; the quoted here-document below is the script it executes.
  local remote_cmd="MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s"
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o BatchMode=yes
    -o ConnectTimeout=5
  )
  local tool bin hca port_dir

  {
    printf '%s\n' "===== LOCAL ====="
    printf 'hostname: %s\n' "$(hostname)"
    printf 'mpirun: %s\n' "$MPI_BIN"
    if [[ ! -x "$MPI_BIN" ]]; then
      printf 'MISSING executable: %s\n' "$MPI_BIN"
    else
      # Only the first line of the version banner is kept.
      "$MPI_BIN" --version 2>&1 | sed -n '1p'
    fi

    for tool in all_reduce_perf alltoall_perf; do
      bin="$NCCL_TESTS_DIR/$tool"
      if [[ ! -x "$bin" ]]; then
        printf 'MISSING executable: %s\n' "$bin"
      else
        printf 'OK executable: %s\n' "$bin"
      fi
    done

    # Word splitting of $HCAS is intentional.
    for hca in $HCAS; do
      port_dir="/sys/class/infiniband/$hca/ports/1"
      if [[ ! -r "$port_dir/state" ]]; then
        printf 'MISSING HCA path: %s\n' "$hca"
      else
        printf 'OK HCA: %s state=%s rate=%s\n' "$hca" \
          "$(cat "$port_dir/state")" \
          "$(cat "$port_dir/rate" 2>/dev/null || echo unknown)"
      fi
    done

    printf '%s\n' "===== REMOTE ====="
    ssh "${ssh_opts[@]}" "$target" "$remote_cmd" <<'EOS'
echo "hostname: $(hostname)"
echo "mpirun: $MPI_BIN"
if [ -x "$MPI_BIN" ]; then
"$MPI_BIN" --version 2>&1 | sed -n '1p'
else
echo "MISSING executable: $MPI_BIN"
fi
for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
if [ -x "$bin" ]; then
echo "OK executable: $bin"
else
echo "MISSING executable: $bin"
fi
done
for hca in $HCAS; do
state="/sys/class/infiniband/$hca/ports/1/state"
rate="/sys/class/infiniband/$hca/ports/1/rate"
if [ -r "$state" ]; then
echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
else
echo "MISSING HCA path: $hca"
fi
done
EOS
  } | tee "$out"
  echo "$out"
}
#######################################
# Print the command-line synopsis, the output directory in effect, and the
# most commonly overridden environment variables.
#
# Globals read: OUT_DIR
# Outputs:      five lines of help text on stdout
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [preflight|all|allreduce-counter|alltoall-counter|graph|pxn-sweep]" \
    "Outputs are written to: $OUT_DIR" \
    "Common overrides:" \
    "HOSTS, PEER_HOST, HCAS, HCA_CSV, MPI_BIN, NCCL_TESTS_DIR," \
    "NCCL_LD_LIBRARY_PATH, BEGIN_SIZE, END_SIZE, WARMUP_ITERS, ITERS"
}
# Dispatch on the mode chosen by the first positional argument. An
# unrecognized mode prints the usage text and exits with status 2; every
# other path ends by reporting the output directory.
case "$MODE" in
  -h | --help | help)
    usage
    ;;
  preflight)
    run_preflight
    ;;
  allreduce-counter)
    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
    ;;
  alltoall-counter)
    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
    ;;
  graph)
    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
    ;;
  pxn-sweep)
    run_pxn_sweep
    ;;
  all)
    # Full pass: preflight, both counter cases, both graph cases, the sweep.
    run_preflight
    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
    run_pxn_sweep
    ;;
  *)
    usage
    exit 2
    ;;
esac
printf 'OUT_DIR=%s\n' "$OUT_DIR"