test_gpu_scripts/scripts/run_multinode_nccl_pdf_matrix.sh

143 lines
3.4 KiB
Bash
Executable File

#!/usr/bin/env bash
set -uo pipefail

# Wrapper for the formal cross-node NCCL PDF matrix on the current two-node
# H100 pair. It fixes the command line, the report file naming, and the
# optional preflight hook; the benchmark implementation itself stays in
# gpu_tester.py / MultiNodeNCCLTest.

# Absolute directory containing this script, and the project root one level up.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
PROJECT_DIR="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)"

# Settings that can be pre-seeded from the environment. The text after := is
# used when the variable is unset or empty.
: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
: "${CONFIG_FILE:=${PROJECT_DIR}/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
: "${OUT_DIR:=${PROJECT_DIR}/reports}"
: "${FORMAT:=md}"

# Mode switches (0 = off, 1 = on) toggled by command-line flags.
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0
# Print the command-line help text on stdout.
# Takes no arguments. Error paths call it with stdout redirected to stderr.
usage() {
  printf '%s\n' \
    'Usage: run_multinode_nccl_pdf_matrix.sh [options]' \
    'Options:' \
    '--python PATH Python executable (default: /root/gpu-test-venv/bin/python)' \
    '--config PATH Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)' \
    '--out-dir PATH Report output directory (default: reports)' \
    '--format FORMAT Report format: md, json, or html (default: md)' \
    '--no-preflight Skip scripts/multinode_nccl_deep_diagnose.sh preflight' \
    '--preflight-only Run only the preflight check, not the matrix workload' \
    '--dry-run Print commands without running them' \
    '-h, --help Show this help'
}
# Parse the command line into the script's configuration globals.
#
# Globals written (only when the matching option is present):
#   PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT, RUN_PREFLIGHT, PREFLIGHT_ONLY,
#   DRY_RUN
# Arguments: the script's command-line arguments.
# Exits: 0 after printing the help for -h/--help; 2 for an unknown argument
#   or for a value-taking option that is given without a value.
parse_args() {
  while (($#)); do
    case "$1" in
      --python | --config | --out-dir | --format)
        # Check for the value explicitly: under `set -u` a bare "$2" would
        # abort with an "unbound variable" message instead of a usage error.
        if (($# < 2)); then
          echo "Option $1 requires a value" >&2
          usage >&2
          exit 2
        fi
        case "$1" in
          --python) PYTHON_BIN="$2" ;;
          --config) CONFIG_FILE="$2" ;;
          --out-dir) OUT_DIR="$2" ;;
          --format) FORMAT="$2" ;;
        esac
        shift 2
        ;;
      --no-preflight)
        RUN_PREFLIGHT=0
        shift
        ;;
      --preflight-only)
        PREFLIGHT_ONLY=1
        shift
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Accept only the report formats listed in the help text.
case "$FORMAT" in
  md | json | html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# If the configured interpreter does not pass the -x test, fall back to the
# python3 found on PATH (left empty when there is none).
[[ -x "$PYTHON_BIN" ]] || PYTHON_BIN="$(command -v python3 || true)"

# Give up when neither the configured interpreter nor the fallback is usable.
if ! [[ -n "$PYTHON_BIN" && -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi
# Timestamp that gives each run its own report file name.
TS="$(date +%Y%m%d_%H%M%S)"

# Create the report directory for real runs only: --dry-run is documented as
# printing commands without running them, so it should leave the filesystem
# alone. Stop here if the directory cannot be created instead of starting a
# run whose report has nowhere to go.
if ! ((DRY_RUN)); then
  if ! mkdir -p -- "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
  fi
fi

REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"

# Both commands are kept as arrays so that every argument stays a single word
# even when a path contains whitespace.
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

# Run banner. NOTE(review): the "Matrix:" line is a fixed description and is
# not derived from CONFIG_FILE, so confirm it still matches when --config is
# overridden.
echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
# Dry run: show the shell-quoted commands that a real run would execute, then
# exit successfully without running anything.
if ((DRY_RUN)); then
  # The preflight command is shown only when preflight is enabled.
  if ((RUN_PREFLIGHT)); then
    printf '%s' 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    echo
  fi

  # With --preflight-only the matrix command is not part of the run, so it is
  # not shown either.
  if ! ((PREFLIGHT_ONLY)); then
    printf '%s' 'DRY RUN matrix:'
    printf ' %q' "${MATRIX_CMD[@]}"
    echo
  fi

  exit 0
fi
# Preflight: run the diagnose script first; on failure report its exit code
# and stop with that same code.
if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

# --preflight-only stops here, before the matrix workload.
if ((PREFLIGHT_ONLY)); then
  exit 0
fi

# Matrix workload. Its exit code is captured so that it becomes the exit code
# of this script after the summary below has been printed.
"${MATRIX_CMD[@]}"
status=$?
if ((status != 0)); then
  echo "Matrix run failed with exit code $status" >&2
fi

# Announce the report only when the file is really there, so a failed run
# does not point the operator at a report that was never produced.
if [[ -e "$REPORT_FILE" ]]; then
  echo "Report written to: $REPORT_FILE"
else
  echo "Report file not found: $REPORT_FILE" >&2
fi
exit "$status"