diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index fb4e354..9b639ad 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -112,6 +112,13 @@ cd /root/test_gpu_scripts
 bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
 ```
 
+### 单节点 H100 原始 all 报告
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 ### 完整深度诊断
 
 ```bash
@@ -147,6 +154,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
+| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 4ccbc23..2aa9bd3 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -27,8 +27,16 @@
 |---|---|
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
+| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
+单节点 H100 原始 all 报告:
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 推荐先跑轻量检查:
 
 ```bash
diff --git a/scripts/run_h100_single_node_all.sh b/scripts/run_h100_single_node_all.sh
new file mode 100755
index 0000000..91d25fe
--- /dev/null
+++ b/scripts/run_h100_single_node_all.sh
@@ -0,0 +1,191 @@
+#!/usr/bin/env bash
+#
+# Run the single-node H100 acceptance suite and keep the raw report paths stable.
+# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
+# snapshot/report naming for repeated machine-level runs.
+#
+# Environment overrides (command-line options take precedence):
+#   PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT
+#
+# Exit status:
+#   0  --help or --dry-run was requested
+#   1  no usable Python interpreter, or the output directory cannot be created
+#   2  invalid command-line usage
+#   otherwise the exit status of gpu_tester.py (snapshot failures only warn)
+#
+# `set -e` is deliberately not enabled: a failing environment snapshot must not
+# stop the test suite, and the gpu_tester.py status is captured explicitly so it
+# can be returned after the summary has been printed.
+set -uo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
+CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+FORMAT="${FORMAT:-md}"
+DRY_RUN=0
+SNAPSHOT=1
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Usage: run_h100_single_node_all.sh [options]
+
+Options:
+  --python PATH     Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH     gpu_tester config file (default: configs/default.yaml)
+  --out-dir PATH    Report output directory (default: reports)
+  --format FORMAT   Report format: md, json, or html (default: md)
+  --no-snapshot     Do not run nccl_environment_snapshot.sh first
+  --dry-run         Print commands without running them
+  -h, --help        Show this help
+EOF
+}
+
+# Exit with a usage error (status 2) when a value-taking option is the last
+# argument. Without this guard "$2" is unset and `set -u` aborts the script with
+# an opaque "unbound variable" message.
+#   $1 - option name
+#   $2 - number of remaining arguments, including the option itself
+require_value() {
+  if (($2 < 2)); then
+    echo "Option $1 requires a value" >&2
+    usage >&2
+    exit 2
+  fi
+}
+
+while (($#)); do
+  case "$1" in
+    --python)
+      require_value "$1" "$#"
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      require_value "$1" "$#"
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      require_value "$1" "$#"
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      require_value "$1" "$#"
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-snapshot)
+      SNAPSHOT=0
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+# Fall back to python3 from PATH when the requested interpreter is unusable, and
+# report the substitution so an unintended interpreter is visible in the log.
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  requested_python="$PYTHON_BIN"
+  PYTHON_BIN="$(command -v python3 || true)"
+  if [[ -n "$PYTHON_BIN" ]]; then
+    echo "Warning: $requested_python is not executable; falling back to $PYTHON_BIN" >&2
+  fi
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+HOST="${HOST:-unknown}"
+TS="$(date +%Y%m%d_%H%M%S)"
+
+SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
+REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"
+
+snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
+test_cmd=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test all
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Host: $HOST"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+if ((SNAPSHOT)); then
+  echo "Snapshot: $SNAPSHOT_FILE"
+fi
+
+if ((DRY_RUN)); then
+  if ((SNAPSHOT)); then
+    printf 'DRY RUN snapshot:'
+    printf ' %q' "${snapshot_cmd[@]}"
+    printf '\n'
+  fi
+  printf 'DRY RUN test:'
+  printf ' %q' "${test_cmd[@]}"
+  printf '\n'
+  exit 0
+fi
+
+# Create the output directory only for real runs so --dry-run has no side effects.
+if ! mkdir -p -- "$OUT_DIR"; then
+  echo "Cannot create output directory: $OUT_DIR" >&2
+  exit 1
+fi
+
+# The snapshot is best effort: report a failure but still run the suite.
+if ((SNAPSHOT)); then
+  snapshot_status=0
+  "${snapshot_cmd[@]}" || snapshot_status=$?
+  if ((snapshot_status != 0)); then
+    echo "Warning: environment snapshot failed with status $snapshot_status; continuing" >&2
+  fi
+fi
+
+"${test_cmd[@]}"
+status=$?
+
+# Only announce files that actually exist so a failed run is not misreported.
+if [[ -e "$REPORT_FILE" ]]; then
+  echo "Report written to: $REPORT_FILE"
+else
+  echo "Warning: report not found at $REPORT_FILE (gpu_tester.py exit status $status)" >&2
+fi
+if ((SNAPSHOT)); then
+  if [[ -e "$SNAPSHOT_FILE" ]]; then
+    echo "Snapshot written to: $SNAPSHOT_FILE"
+  else
+    echo "Warning: snapshot not found at $SNAPSHOT_FILE" >&2
+  fi
+fi
+
+exit "$status"