Add single-node H100 all runner

2026-05-23 19:16:40 +08:00 · 2026-05-23 19:16:40 +08:00 · 2c5c31e451
commit 2c5c31e451
parent cadfbcfaa3
3 changed files with 150 additions and 0 deletions
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -112,6 +112,13 @@ cd /root/test_gpu_scripts
 bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
 ```

+### 单节点 H100 原始 all 报告
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 ### 完整深度诊断

 ```bash
@@ -147,6 +154,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
+| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |

 ## 当前建议

--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -27,8 +27,16 @@
 |---|---|
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
+| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |

+单节点 H100 原始 all 报告：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 推荐先跑轻量检查：

 ```bash
--- a/scripts/run_h100_single_node_all.sh
+++ b/scripts/run_h100_single_node_all.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# Wrapper for the single-node H100 acceptance suite.
+#
+# gpu_tester.py still implements the suite itself; this script only gives the
+# environment snapshot and the raw report stable names across repeated
+# machine-level runs.
+#
+# PYTHON_BIN, CONFIG_FILE, OUT_DIR, and FORMAT may be preset in the
+# environment; an unset or empty value falls back to the default below.
+set -uo pipefail
+
+script_path="${BASH_SOURCE[0]}"
+SCRIPT_DIR="$(cd -- "$(dirname -- "$script_path")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
+: "${CONFIG_FILE:=$PROJECT_DIR/configs/default.yaml}"
+: "${OUT_DIR:=$PROJECT_DIR/reports}"
+: "${FORMAT:=md}"
+
+# Switches changed only by command-line options.
+DRY_RUN=0
+SNAPSHOT=1
+
+# Print the command-line help text to stdout.
+usage() {
+  printf '%s\n' \
+    'Usage: run_h100_single_node_all.sh [options]' \
+    '' \
+    'Options:' \
+    '  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)' \
+    '  --config PATH       gpu_tester config file (default: configs/default.yaml)' \
+    '  --out-dir PATH      Report output directory (default: reports)' \
+    '  --format FORMAT     Report format: md, json, or html (default: md)' \
+    '  --no-snapshot       Do not run nccl_environment_snapshot.sh first' \
+    '  --dry-run           Print commands without running them' \
+    '  -h, --help          Show this help'
+}
+
+# Parse command-line options; each one overrides the matching default.
+# Exits 2 (after printing usage to stderr) on an unknown option or when an
+# option that takes a value is given without one.
+while (($#)); do
+  case "$1" in
+    --python | --config | --out-dir | --format)
+      # These options take a value. Check for it explicitly: under `set -u` a
+      # bare "$2" would abort with an opaque "unbound variable" error instead.
+      if (($# < 2)); then
+        echo "Missing value for option: $1" >&2
+        usage >&2
+        exit 2
+      fi
+      case "$1" in
+        --python) PYTHON_BIN="$2" ;;
+        --config) CONFIG_FILE="$2" ;;
+        --out-dir) OUT_DIR="$2" ;;
+        --format) FORMAT="$2" ;;
+      esac
+      shift 2
+      ;;
+    --no-snapshot)
+      SNAPSHOT=0
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h | --help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
+REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"
+
+snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
+test_cmd=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test all
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+# Summarize the resolved settings before doing any work.
+printf '%s\n' \
+  "Project: $PROJECT_DIR" \
+  "Host: $HOST" \
+  "Config: $CONFIG_FILE" \
+  "Report: $REPORT_FILE"
+((SNAPSHOT)) && echo "Snapshot: $SNAPSHOT_FILE"
+
+# Dry run: print each command with shell quoting (%q) so it can be pasted
+# back into a shell, then exit 0 before either command is run.
+if ((DRY_RUN)); then
+  if ((SNAPSHOT)); then
+    quoted_snapshot="$(printf ' %q' "${snapshot_cmd[@]}")"
+    printf 'DRY RUN snapshot:%s\n' "$quoted_snapshot"
+  fi
+  quoted_test="$(printf ' %q' "${test_cmd[@]}")"
+  printf 'DRY RUN test:%s\n' "$quoted_test"
+  exit 0
+fi
+
+# The snapshot is best-effort: a failure is reported on stderr, but the test
+# suite still runs.
+if ((SNAPSHOT)) && ! "${snapshot_cmd[@]}"; then
+  echo "Warning: snapshot command failed; continuing with the test run" >&2
+fi
+
+# Run the suite; its exit status becomes this script's exit status.
+"${test_cmd[@]}"
+status=$?
+
+# Announce an artifact only when its file exists, so a failed run never
+# points the reader at a file that was not created.
+if [[ -e "$REPORT_FILE" ]]; then
+  echo "Report written to: $REPORT_FILE"
+else
+  echo "Warning: expected report not found: $REPORT_FILE" >&2
+fi
+if ((SNAPSHOT)); then
+  if [[ -e "$SNAPSHOT_FILE" ]]; then
+    echo "Snapshot written to: $SNAPSHOT_FILE"
+  else
+    echo "Warning: expected snapshot not found: $SNAPSHOT_FILE" >&2
+  fi
+fi
+
+exit "$status"