#!/usr/bin/env bash
set -euo pipefail

# Deep-diagnose multi-node NCCL behavior from the coordinator node.
# Default values match the current 2-node H100 cross-leaf investigation.
#
# Every setting below can be overridden from the environment. Results (logs,
# counter snapshots, summaries) are written under OUT_DIR.

MODE="${1:-all}"
MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"
HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
PEER_HOST="${PEER_HOST:-172.72.8.16}"
SSH_USER="${SSH_USER:-root}"
# HCAS (space separated) drives the sysfs counter reads; HCA_CSV (comma
# separated) is what gets exported as NCCL_IB_HCA. They are independent knobs.
HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
HCA_CSV="${HCA_CSV:-mlx5_0,mlx5_1,mlx5_6,mlx5_7}"
OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"
BEGIN_SIZE="${BEGIN_SIZE:-16G}"
END_SIZE="${END_SIZE:-16G}"
WARMUP_ITERS="${WARMUP_ITERS:-10}"
ITERS="${ITERS:-10}"
GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
GRAPH_ITERS="${GRAPH_ITERS:-1}"
SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
SWEEP_ITERS="${SWEEP_ITERS:-5}"
NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"

# Baseline NCCL settings, captured ONCE at startup (caller's value or the
# script default). set_common_env resets from these snapshots, so a value
# exported by one test case can never become the "default" of the next one.
DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
DEFAULT_NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}"
DEFAULT_NCCL_IB_SL="${NCCL_IB_SL:-5}"
DEFAULT_NCCL_IB_TC="${NCCL_IB_TC:-136}"
DEFAULT_NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond0}"
DEFAULT_NCCL_IB_TIMEOUT="${NCCL_IB_TIMEOUT:-22}"
DEFAULT_NCCL_NET_PLUGIN="${NCCL_NET_PLUGIN:-none}"
DEFAULT_NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-1}"
DEFAULT_NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-5}"
DEFAULT_NCCL_NET_GDR_READ="${NCCL_NET_GDR_READ:-1}"
DEFAULT_NCCL_DMABUF_ENABLE="${NCCL_DMABUF_ENABLE:-0}"

# Names of the sysfs files read under .../ports/1/counters and
# .../ports/1/hw_counters respectively (space separated).
COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"

mkdir -p "$OUT_DIR"

# mpirun invocation prefix shared by every benchmark run.
mpi_base=(
  "$MPI_BIN"
  --allow-run-as-root
  --mca btl_openib_warn_no_device_params_found 0
  --mca btl_tcp_if_include bond0
  --mca oob_tcp_if_include bond0
  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
  -H "$HOSTS"
  --map-by ppr:8:node
  -np 16
)

# Non-interactive ssh command prefix targeting the peer node.
ssh_peer=(
  ssh
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o BatchMode=yes
  -o ConnectTimeout=5
  "${SSH_USER}@${PEER_HOST}"
)

# Environment variables forwarded to the MPI ranks (via "-x NAME") whenever
# they are set in this shell.
base_exports=(
  LD_LIBRARY_PATH
  NCCL_IB_GID_INDEX
  NCCL_IB_SL
  NCCL_IB_TC
  NCCL_SOCKET_IFNAME
  NCCL_DEBUG
  NCCL_DEBUG_SUBSYS
  NCCL_IB_TIMEOUT
  NCCL_IB_HCA
  NCCL_NET_PLUGIN
  NCCL_NVLS_ENABLE
  NCCL_NET_GDR_LEVEL
  NCCL_NET_GDR_READ
  NCCL_DMABUF_ENABLE
  NCCL_PXN_DISABLE
  NCCL_IB_QPS_PER_CONNECTION
  NCCL_IB_SPLIT_DATA_ON_QPS
  NCCL_MIN_NCHANNELS
  NCCL_MAX_NCHANNELS
  NCCL_BUFFSIZE
  NCCL_P2P_NET_CHUNKSIZE
  NCCL_NCHANNELS_PER_NET_PEER
  NCCL_IB_AR_THRESHOLD
)

#######################################
# Reset the NCCL environment to the baseline before a test case.
# Per-case tunables are unset; baseline variables are re-exported from the
# DEFAULT_* snapshots taken at startup. (Re-deriving them with
# "${VAR:-default}" would keep whatever a previous case exported, e.g.
# NCCL_NVLS_ENABLE=0 from one sweep case leaking into all later cases.)
# Globals: NCCL_LD_LIBRARY_PATH, HCA_CSV, DEFAULT_NCCL_* (read);
#          LD_LIBRARY_PATH and NCCL_* (exported / unset)
#######################################
set_common_env() {
  unset NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
  unset NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
  unset NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
  unset NCCL_IB_AR_THRESHOLD

  export LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
  export NCCL_IB_GID_INDEX="$DEFAULT_NCCL_IB_GID_INDEX"
  export NCCL_IB_SL="$DEFAULT_NCCL_IB_SL"
  export NCCL_IB_TC="$DEFAULT_NCCL_IB_TC"
  export NCCL_SOCKET_IFNAME="$DEFAULT_NCCL_SOCKET_IFNAME"
  export NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
  export NCCL_IB_TIMEOUT="$DEFAULT_NCCL_IB_TIMEOUT"
  export NCCL_IB_HCA="$HCA_CSV"
  export NCCL_NET_PLUGIN="$DEFAULT_NCCL_NET_PLUGIN"
  export NCCL_NVLS_ENABLE="$DEFAULT_NCCL_NVLS_ENABLE"
  export NCCL_NET_GDR_LEVEL="$DEFAULT_NCCL_NET_GDR_LEVEL"
  export NCCL_NET_GDR_READ="$DEFAULT_NCCL_NET_GDR_READ"
  export NCCL_DMABUF_ENABLE="$DEFAULT_NCCL_DMABUF_ENABLE"
}

#######################################
# Export the per-case overrides on top of the baseline environment.
# Arguments: $1 - space separated NAME=VALUE assignments (may be empty)
#######################################
apply_extra_env() {
  local extra="${1:-}"
  if [[ -n "$extra" ]]; then
    # NOTE(review): eval is kept so existing callers that rely on shell
    # quoting inside the string keep working. Only pass trusted,
    # script-defined strings here, never external input.
    eval "export $extra"
  fi
}

#######################################
# Print the "-x NAME" mpirun arguments, one token per line, for every name in
# base_exports that is currently set (an empty value still counts as set).
# Outputs: alternating "-x" / NAME lines on stdout
#######################################
mpi_xargs() {
  local name
  for name in "${base_exports[@]}"; do
    if [[ -n "${!name+x}" ]]; then
      printf -- '-x\n%s\n' "$name"
    fi
  done
}

#######################################
# Run one nccl-tests binary under mpirun and print its bandwidth summary.
# Arguments: $1 - label prefixed to the summary line
#            $2 - path to the nccl-tests binary
#            $3 - log file receiving mpirun's stdout and stderr
#            $4 - warmup iterations (-w)
#            $5 - measured iterations (-n)
# Outputs:   "<label> <Avg bus bandwidth line>" for each matching log line
# Returns:   mpirun's exit status when it fails (after reporting the log path
#            on stderr), otherwise awk's status
#######################################
run_nccl() {
  local op="$1"
  local bin="$2"
  local log="$3"
  local warmup="$4"
  local iters="$5"
  local -a export_flags=()
  local rc=0

  mapfile -t export_flags < <(mpi_xargs)

  # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on
  # older bash versions.
  "${mpi_base[@]}" ${export_flags[@]+"${export_flags[@]}"} \
    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters" \
    >"$log" 2>&1 || rc=$?

  if ((rc != 0)); then
    # All benchmark output went to the log, so say where to look.
    printf 'run_nccl: %s failed with exit status %d; see %s\n' \
      "$op" "$rc" "$log" >&2
    return "$rc"
  fi

  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
}

#######################################
# Snapshot the local port counters of every HCA in HCAS.
# Each readable counter becomes one line:
#   <host_label> <hca> <counters|hw_counters> <name> <value>
# Unreadable counter files are skipped; a failed read is recorded as 0.
# Arguments: $1 - host label written in the first column
#            $2 - output file (truncated first)
#######################################
read_one_snapshot() {
  local host_label="$1"
  local out="$2"
  local hca kind names c f

  : >"$out"
  # HCAS / COUNTERS / HW_COUNTERS are space separated lists; the unquoted
  # expansions below rely on word splitting on purpose.
  for hca in $HCAS; do
    for kind in counters hw_counters; do
      if [[ "$kind" == counters ]]; then
        names="$COUNTERS"
      else
        names="$HW_COUNTERS"
      fi
      for c in $names; do
        f="/sys/class/infiniband/$hca/ports/1/$kind/$c"
        if [[ -r "$f" ]]; then
          printf '%s %s %s %s %s\n' "$host_label" "$hca" "$kind" "$c" \
            "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
        fi
      done
    done
  done
}

#######################################
# Snapshot the same counters on the peer node over ssh, in the same line
# format as read_one_snapshot. The remote shell's $HOSTNAME is used as the
# host label.
# Arguments: $1 - local output file
#######################################
read_remote_snapshot() {
  local out="$1"
  "${ssh_peer[@]}" \
    "HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s" <<'EOS' >"$out"
for hca in $HCAS; do
  for c in $COUNTERS; do
    f="/sys/class/infiniband/$hca/ports/1/counters/$c"
    if [ -r "$f" ]; then
      printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
    fi
  done
  for c in $HW_COUNTERS; do
    f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
    if [ -r "$f" ]; then
      printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
    fi
  done
done
EOS
}

#######################################
# Diff two pairs of counter snapshots and write a report with two sections:
# NONZERO_DELTAS (every counter that changed) and ERROR_OR_CONGESTION_DELTAS
# (changed counters from the "interesting" set, or "none").
# Arguments: $1 $2 - "before" snapshots
#            $3 $4 - "after" snapshots
#            $5    - report file
#######################################
summarize_counter_delta() {
  local before_a="$1"
  local before_b="$2"
  local after_a="$3"
  local after_b="$4"
  local out="$5"
  python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
import pathlib
import sys

# Counters whose growth is reported in the error/congestion section.
interesting = {
    "port_xmit_wait",
    "port_xmit_discards",
    "port_rcv_errors",
    "port_rcv_remote_physical_errors",
    "port_rcv_switch_relay_errors",
    "port_xmit_constraint_errors",
    "port_rcv_constraint_errors",
    "symbol_error",
    "link_error_recovery",
    "link_downed",
    "local_link_integrity_errors",
    "excessive_buffer_overrun_errors",
    "VL15_dropped",
    "roce_adp_retrans",
    "roce_adp_retrans_to",
    "roce_slow_restart",
    "roce_slow_restart_cnps",
    "roce_slow_restart_trans",
    "packet_seq_err",
    "out_of_sequence",
    "out_of_buffer",
    "duplicate_request",
    "implied_nak_seq_err",
    "local_ack_timeout_err",
    "req_transport_retries_exceeded",
    "rnr_nak_retry_err",
}


def load(path):
    """Parse a snapshot into {(host, hca, kind, counter): int}.

    Lines without exactly five fields or with a non-integer value are skipped.
    """
    data = {}
    for line in pathlib.Path(path).read_text().splitlines():
        parts = line.split()
        if len(parts) != 5:
            continue
        host, hca, kind, counter, value = parts
        try:
            data[(host, hca, kind, counter)] = int(value)
        except ValueError:
            pass
    return data


before = {}
after = {}
before.update(load(sys.argv[1]))
before.update(load(sys.argv[2]))
after.update(load(sys.argv[3]))
after.update(load(sys.argv[4]))

print("NONZERO_DELTAS")
for key in sorted(set(before) | set(after)):
    delta = after.get(key, 0) - before.get(key, 0)
    if not delta:
        continue
    host, hca, kind, counter = key
    if counter in {"port_xmit_data", "port_rcv_data"}:
        # These two counters are treated as 4-byte words when converting to GiB.
        gib = delta * 4 / (1024 ** 3)
        print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
    else:
        print(f"{host} {hca} {kind} {counter} {delta}")

print("ERROR_OR_CONGESTION_DELTAS")
seen = False
for key in sorted(set(before) | set(after)):
    delta = after.get(key, 0) - before.get(key, 0)
    if delta and key[3] in interesting:
        seen = True
        print(*key, delta)
if not seen:
    print("none")
PY
}

#######################################
# Run one benchmark bracketed by local and remote counter snapshots and write
# the counter delta report next to the log.
# The "after" snapshots and the report are produced even when the benchmark
# fails, so the counters of a failing run are not lost.
# Arguments: $1 - operation label (also names the output directory)
#            $2 - path to the nccl-tests binary
#            $3 - optional NAME=VALUE overrides (see apply_extra_env)
# Outputs:   the result directory on stdout
# Returns:   the benchmark's exit status
#######################################
run_counter_case() {
  local op="$1"
  local bin="$2"
  local extra="${3:-}"
  local rc=0

  set_common_env
  apply_extra_env "$extra"

  local dir="$OUT_DIR/${op}_counter"
  mkdir -p "$dir"

  read_one_snapshot "$(hostname)" "$dir/before.local"
  read_remote_snapshot "$dir/before.remote"
  run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS" || rc=$?
  read_one_snapshot "$(hostname)" "$dir/after.local"
  read_remote_snapshot "$dir/after.remote"
  summarize_counter_delta "$dir/before.local" "$dir/before.remote" \
    "$dir/after.local" "$dir/after.remote" "$dir/counter_delta.txt"

  echo "$dir"
  return "$rc"
}

#######################################
# Condense an NCCL_DEBUG=INFO log into counts of selected log patterns
# (bandwidth, version, HCAs in use, graph patterns, channels, transports).
# Arguments: $1 - NCCL log file
#            $2 - summary file to write
#######################################
summarize_graph_log() {
  local log="$1"
  local out="$2"
  python3 - "$log" >"$out" <<'PY'
from pathlib import Path
import collections
import re
import sys

text = Path(sys.argv[1]).read_text(errors="ignore")
# Last reported average bus bandwidth, or "NA" when the log has none.
print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1])
print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text))))
print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))
print("using_hca")
for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4):
    print(f" {count} {value}")
print("pattern_counts")
patterns = re.findall(
    r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
    text,
)
for key, count in collections.Counter(patterns).most_common():
    print(f" {count} {key}")
print("channel_summary")
for value, count in collections.Counter(
    re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text)
).most_common():
    print(f" {count} {value}")
print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text)))
print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text)))
for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]:
    print(token, text.count(token))
print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)]))
PY
}

#######################################
# Run one benchmark with verbose NCCL logging and summarize the log.
# Arguments: $1 - operation label (names the log and summary files)
#            $2 - path to the nccl-tests binary
#            $3 - optional NAME=VALUE overrides (see apply_extra_env)
# Outputs:   the summary file path on stdout
#######################################
run_graph_case() {
  local op="$1"
  local bin="$2"
  local extra="${3:-}"

  set_common_env
  export NCCL_DEBUG=INFO
  export NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
  apply_extra_env "$extra"

  local dir="$OUT_DIR/graph"
  mkdir -p "$dir"
  local log="$dir/${op}.log"

  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
  summarize_graph_log "$log" "$dir/${op}_summary.txt"
  echo "$dir/${op}_summary.txt"
}

#######################################
# Run alltoall_perf with NCCL_PXN_DISABLE=1 once per tuning case and collect
# the per-case bandwidth lines in one summary file.
# Each case is "name|overrides"; the environment is reset to the baseline
# before every case.
# Outputs: per-case output (also appended to the summary), then the summary
#          file path
#######################################
run_pxn_sweep() {
  local dir="$OUT_DIR/pxn_sweep"
  mkdir -p "$dir"

  local cases=(
    "baseline|"
    "nvls_off|NCCL_NVLS_ENABLE=0"
    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
    "buff8m|NCCL_BUFFSIZE=8388608"
    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
    "ar0|NCCL_IB_AR_THRESHOLD=0"
  )
  local item name extra log

  : >"$dir/summary.txt"
  for item in "${cases[@]}"; do
    name="${item%%|*}" # text before the first "|"
    extra="${item#*|}" # text after the first "|"

    set_common_env
    export NCCL_PXN_DISABLE=1
    apply_extra_env "$extra"

    log="$dir/${name}.log"
    {
      echo "===== CASE $name ====="
      echo "extra: ${extra:-none}"
      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$log" \
        "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
      awk '/Avg bus bandwidth/ {print}' "$log" | tail -1
    } | tee -a "$dir/summary.txt"
  done

  echo "$dir/summary.txt"
}

#######################################
# Check on both nodes that mpirun, the nccl-tests binaries and the HCA sysfs
# entries are present; the report is shown and saved to preflight.txt.
# Outputs: the report, then the report file path
#######################################
run_preflight() {
  set_common_env
  local out="$OUT_DIR/preflight.txt"
  {
    echo "===== LOCAL ====="
    echo "hostname: $(hostname)"
    echo "mpirun: $MPI_BIN"
    if [[ -x "$MPI_BIN" ]]; then
      "$MPI_BIN" --version 2>&1 | sed -n '1p'
    else
      echo "MISSING executable: $MPI_BIN"
    fi
    for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
      if [[ -x "$bin" ]]; then
        echo "OK executable: $bin"
      else
        echo "MISSING executable: $bin"
      fi
    done
    for hca in $HCAS; do
      local state="/sys/class/infiniband/$hca/ports/1/state"
      local rate="/sys/class/infiniband/$hca/ports/1/rate"
      if [[ -r "$state" ]]; then
        echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
      else
        echo "MISSING HCA path: $hca"
      fi
    done
    echo "===== REMOTE ====="
    # The same checks, executed by a shell on the peer node.
    "${ssh_peer[@]}" \
      "MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s" <<'EOS'
echo "hostname: $(hostname)"
echo "mpirun: $MPI_BIN"
if [ -x "$MPI_BIN" ]; then
  "$MPI_BIN" --version 2>&1 | sed -n '1p'
else
  echo "MISSING executable: $MPI_BIN"
fi
for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
  if [ -x "$bin" ]; then
    echo "OK executable: $bin"
  else
    echo "MISSING executable: $bin"
  fi
done
for hca in $HCAS; do
  state="/sys/class/infiniband/$hca/ports/1/state"
  rate="/sys/class/infiniband/$hca/ports/1/rate"
  if [ -r "$state" ]; then
    echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
  else
    echo "MISSING HCA path: $hca"
  fi
done
EOS
  } | tee "$out"
  echo "$out"
}

# NOTE(review): the visible source is cut off mid-definition right after
#   usage() { cat <
# The rest of usage() and whatever consumes MODE are not visible here, so
# they are neither reproduced nor guessed at.