#!/usr/bin/env bash
# Run the single-node H100 acceptance suite and keep the raw report paths stable.
# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
# snapshot/report naming for repeated machine-level runs.
#
# Environment overrides (command-line options take precedence):
#   PYTHON_BIN   Python executable
#   CONFIG_FILE  gpu_tester config file
#   OUT_DIR      Report output directory
#   FORMAT       Report format (md, json, or html)
#
# Exit status:
#   2  invalid command line (unknown option, missing option value, bad format)
#   1  no usable Python interpreter, project directory could not be resolved,
#      or the output directory could not be created
#   0  --help or --dry-run finished
#   Otherwise the exit status of the gpu_tester.py run is passed through.

# No `set -e`: a failed snapshot does not stop the run, and the status of the
# test command is captured explicitly so the summary lines can still be printed.
set -uo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
if [[ -z "$SCRIPT_DIR" ]]; then
  # Without this guard an empty SCRIPT_DIR would make PROJECT_DIR resolve to "/".
  echo "Cannot determine the script directory." >&2
  exit 1
fi
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
if [[ -z "$PROJECT_DIR" ]]; then
  echo "Cannot determine the project directory." >&2
  exit 1
fi

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
SNAPSHOT=1

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
Usage: run_h100_single_node_all.sh [options]

Options:
  --python PATH     Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH     gpu_tester config file (default: configs/default.yaml)
  --out-dir PATH    Report output directory (default: reports)
  --format FORMAT   Report format: md, json, or html (default: md)
  --no-snapshot     Do not run nccl_environment_snapshot.sh first
  --dry-run         Print commands without running them
  -h, --help        Show this help
EOF
}

# Abort with a usage error (exit 2) when an option that takes a value is the
# last word on the command line.
# Arguments: $1 - option name, $2 - number of remaining words including the option
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$1" "$#"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$1" "$#"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$1" "$#"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$1" "$#"
      FORMAT="$2"
      shift 2
      ;;
    --no-snapshot)
      SNAPSHOT=0
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

case "$FORMAT" in
  md|json|html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# Fall back to whatever python3 is on PATH when the configured interpreter is
# not executable, and say so, so a mistyped --python does not go unnoticed.
if [[ ! -x "$PYTHON_BIN" ]]; then
  requested_python="$PYTHON_BIN"
  PYTHON_BIN="$(command -v python3 || true)"
  if [[ -n "$PYTHON_BIN" && -x "$PYTHON_BIN" ]]; then
    echo "Note: '$requested_python' is not executable; using $PYTHON_BIN instead." >&2
  fi
fi
if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Host name and timestamp are part of the file names so repeated runs do not
# overwrite each other's output.
SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"

# Commands are kept as arrays so paths containing spaces stay intact.
snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
test_cmd=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test all
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Host: $HOST"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
if ((SNAPSHOT)); then
  echo "Snapshot: $SNAPSHOT_FILE"
fi

if ((DRY_RUN)); then
  # %q prints each word shell-quoted, so the output can be pasted back into a shell.
  if ((SNAPSHOT)); then
    printf 'DRY RUN snapshot:'
    printf ' %q' "${snapshot_cmd[@]}"
    printf '\n'
  fi
  printf 'DRY RUN test:'
  printf ' %q' "${test_cmd[@]}"
  printf '\n'
  exit 0
fi

# Created only for a real run, so --dry-run leaves the filesystem untouched.
if ! mkdir -p -- "$OUT_DIR"; then
  echo "Cannot create output directory: $OUT_DIR" >&2
  exit 1
fi

if ((SNAPSHOT)); then
  # Best effort: a failed snapshot is reported but does not block the test run.
  if ! "${snapshot_cmd[@]}"; then
    echo "Warning: environment snapshot failed; continuing with the test run." >&2
  fi
fi

"${test_cmd[@]}"
status=$?

# Only claim a file was written when it is actually there.
if [[ -e "$REPORT_FILE" ]]; then
  echo "Report written to: $REPORT_FILE"
else
  echo "Warning: report file not found: $REPORT_FILE" >&2
fi
if ((SNAPSHOT)); then
  if [[ -e "$SNAPSHOT_FILE" ]]; then
    echo "Snapshot written to: $SNAPSHOT_FILE"
  else
    echo "Warning: snapshot file not found: $SNAPSHOT_FILE" >&2
  fi
fi

exit "$status"