#!/usr/bin/env bash
# errexit (-e) is intentionally not enabled: the exit codes of the preflight
# and workload commands are captured and propagated explicitly below.
set -uo pipefail

# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
# collectives used by the single-node H100 acceptance flow.
#
# Environment overrides (each one also has a command-line option):
#   PYTHON_BIN   Python executable
#   CONFIG_FILE  Config file passed to gpu_tester.py
#   OUT_DIR      Directory that receives the report and the artifact directory
#   FORMAT       Report format: md, json, or html
#
# Exit status:
#   0      success (also --help, --dry-run, and a passing --preflight-only run)
#   1      no usable Python executable, or an output directory cannot be created
#   2      bad command line (unknown option, missing value, unsupported format)
#   other  exit code of the failed preflight or workload command

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

# Print the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage: run_multinode_nccl_all_collectives.sh [options]

Options:
  --python PATH      Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH      Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
  --out-dir PATH     Report output directory (default: reports)
  --format FORMAT    Report format: md, json, or html (default: md)
  --no-preflight     Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only   Run only the preflight check, not the workload
  --dry-run          Print commands without running them
  -h, --help         Show this help
EOF
}

# Abort with a usage error when an option that takes a value is the last word
# on the command line. Without this guard, reading "$2" under `set -u` kills
# the script with an unhelpful "unbound variable" message.
# Arguments: the remaining command-line words, option name first.
require_value() {
  if (($# < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$@"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$@"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$@"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$@"
      FORMAT="$2"
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
  echo "Unsupported format: $FORMAT" >&2
  exit 2
fi

# Fall back to whatever python3 is on PATH when the configured interpreter
# is not an executable file.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi
if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

# One timestamp is shared by the report file and the artifact directory so
# the two outputs of a run can be matched up.
TS="$(date +%Y%m%d_%H%M%S)"
REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"

# Commands are kept as arrays so arguments containing spaces survive intact.
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"

# Dry run: print the shell-quoted commands and stop before touching the
# filesystem or launching anything.
if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN workload:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${RUN_CMD[@]}"
  printf '\n'
  exit 0
fi

# Created only for real runs; a failure here is fatal because nothing after
# this point can write its output.
if ! mkdir -p "$OUT_DIR"; then
  echo "Cannot create output directory: $OUT_DIR" >&2
  exit 1
fi

if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

if ! mkdir -p "$ARTIFACT_DIR"; then
  echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
  exit 1
fi

# The artifact directory is handed to the workload through its environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

# Only claim the outputs were written when the workload succeeded; the
# workload's own exit code is propagated either way.
if ((status == 0)); then
  echo "Report written to: $REPORT_FILE"
  echo "Artifacts written to: $ARTIFACT_DIR"
else
  echo "Workload failed with exit code $status; any partial output is under: $OUT_DIR" >&2
fi
exit "$status"