Add single-node H100 all runner
This commit is contained in:
parent
cadfbcfaa3
commit
2c5c31e451
@ -112,6 +112,13 @@ cd /root/test_gpu_scripts
|
||||
bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
|
||||
```
|
||||
|
||||
### 单节点 H100 原始 all 报告
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/run_h100_single_node_all.sh
|
||||
```
|
||||
|
||||
### 完整深度诊断
|
||||
|
||||
```bash
|
||||
@ -147,6 +154,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
|
||||
|
||||
## 当前建议
|
||||
|
||||
|
||||
@ -27,8 +27,16 @@
|
||||
|---|---|
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
||||
|
||||
单节点 H100 原始 all 报告:
|
||||
|
||||
```bash
|
||||
cd /root/test_gpu_scripts
|
||||
bash scripts/run_h100_single_node_all.sh
|
||||
```
|
||||
|
||||
推荐先跑轻量检查:
|
||||
|
||||
```bash
|
||||
|
||||
134
scripts/run_h100_single_node_all.sh
Executable file
134
scripts/run_h100_single_node_all.sh
Executable file
@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env bash
set -uo pipefail

# Run the single-node H100 acceptance suite and keep the raw report paths stable.
# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
# snapshot/report naming for repeated machine-level runs.
#
# Environment overrides (each one can also be set through a command-line flag):
#   PYTHON_BIN   Python executable used to run gpu_tester.py
#   CONFIG_FILE  gpu_tester config file
#   OUT_DIR      directory that receives the report and the snapshot
#   FORMAT       report format

# Absolute directory of this script, and the project root one level above it.
# Fail early if either cannot be resolved: an empty SCRIPT_DIR would otherwise
# make PROJECT_DIR resolve to "/" and every default path below would be wrong.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" || {
  echo "Cannot resolve script directory for ${BASH_SOURCE[0]}" >&2
  exit 1
}
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)" || {
  echo "Cannot resolve project directory from $SCRIPT_DIR" >&2
  exit 1
}

# Defaults; an unset or empty environment variable falls back to the value here.
PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"

# Flags toggled by --dry-run / --no-snapshot (0 = off, 1 = on).
DRY_RUN=0
SNAPSHOT=1
|
||||
|
||||
# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally with no
# variable or command expansion.
usage() {
  cat <<'EOF'
Usage: run_h100_single_node_all.sh [options]

Options:
  --python PATH     Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH     gpu_tester config file (default: configs/default.yaml)
  --out-dir PATH    Report output directory (default: reports)
  --format FORMAT   Report format: md, json, or html (default: md)
  --no-snapshot     Do not run nccl_environment_snapshot.sh first
  --dry-run         Print commands without running them
  -h, --help        Show this help
EOF
}
|
||||
|
||||
# Parse command-line options into the PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT,
# DRY_RUN and SNAPSHOT globals.
# Arguments: the script's argument list ("$@").
# Exits:     0 after printing help for -h/--help;
#            2 for an unknown argument or an option that is missing its value.
parse_args() {
  while (($#)); do
    case "$1" in
      --python | --config | --out-dir | --format)
        # These options consume the following word. Check that it exists so a
        # trailing flag yields a clear message instead of an "unbound variable"
        # abort from `set -u` when $2 is read.
        if (($# < 2)); then
          echo "Missing value for $1" >&2
          exit 2
        fi
        case "$1" in
          --python) PYTHON_BIN="$2" ;;
          --config) CONFIG_FILE="$2" ;;
          --out-dir) OUT_DIR="$2" ;;
          --format) FORMAT="$2" ;;
        esac
        shift 2
        ;;
      --no-snapshot)
        SNAPSHOT=0
        shift
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Reject report formats other than the three accepted values.
case "$FORMAT" in
  md | json | html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# A bare command name (no slash) is executed through PATH, so resolve it through
# PATH here as well instead of testing a same-named file in the current directory.
if [[ "$PYTHON_BIN" != */* ]]; then
  resolved_python="$(command -v -- "$PYTHON_BIN" 2>/dev/null || true)"
  if [[ -n "$resolved_python" ]]; then
    PYTHON_BIN="$resolved_python"
  fi
fi

# Fall back to python3 from PATH when the requested interpreter is unusable.
# -f is checked in addition to -x because -x alone is also true for directories.
if [[ ! -f "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  requested_python="$PYTHON_BIN"
  PYTHON_BIN="$(command -v python3 || true)"
  if [[ -n "$PYTHON_BIN" ]]; then
    # Say so on stderr so the interpreter actually used is never a surprise.
    echo "Python '$requested_python' is not executable; falling back to $PYTHON_BIN" >&2
  fi
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi
|
||||
|
||||
# Host name and timestamp are embedded in the output file names.
HOST="$(hostname 2>/dev/null || echo unknown)"
# Guard against a hostname command that succeeds but prints nothing.
HOST="${HOST:-unknown}"
TS="$(date +%Y%m%d_%H%M%S)"

# --dry-run only prints commands, so leave the filesystem untouched in that
# mode. Otherwise stop right away if the output directory cannot be created,
# rather than letting the snapshot and test run fail later on.
if ((DRY_RUN == 0)); then
  if ! mkdir -p -- "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
  fi
fi

SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"

# Commands are kept as arrays so paths containing spaces survive both the
# dry-run printout and the real invocation.
snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
test_cmd=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test all
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)
|
||||
|
||||
# Show the resolved run parameters before anything is executed.
printf '%s\n' \
  "Project: $PROJECT_DIR" \
  "Host: $HOST" \
  "Config: $CONFIG_FILE" \
  "Report: $REPORT_FILE"

# The snapshot path is only relevant when the snapshot step is enabled.
if [[ "$SNAPSHOT" -ne 0 ]]; then
  printf '%s\n' "Snapshot: $SNAPSHOT_FILE"
fi
|
||||
|
||||
# Print one command as "DRY RUN <label>: <shell-quoted words>" on a single line.
# Arguments: $1 - label; remaining arguments - the command words.
show_dry_run_cmd() {
  local label="$1"
  shift
  printf 'DRY RUN %s:' "$label"
  printf ' %q' "$@"
  printf '\n'
}

# In dry-run mode, print the commands that would run and stop successfully.
if [[ "$DRY_RUN" -ne 0 ]]; then
  if [[ "$SNAPSHOT" -ne 0 ]]; then
    show_dry_run_cmd snapshot "${snapshot_cmd[@]}"
  fi
  show_dry_run_cmd test "${test_cmd[@]}"
  exit 0
fi
|
||||
|
||||
# Take the environment snapshot first. It is best-effort: a failure is reported
# on stderr but does not prevent the test run or change the exit status.
if ((SNAPSHOT)); then
  if ! "${snapshot_cmd[@]}"; then
    echo "Warning: environment snapshot failed; continuing with the test run." >&2
  fi
fi

# Run the test suite and remember its exit status; it becomes the exit status
# of this script.
"${test_cmd[@]}"
status=$?

# Only claim a file was written when it actually exists.
# NOTE(review): assumes gpu_tester.py writes exactly to the --output path and
# the snapshot script writes exactly to its argument - confirm.
if [[ -e "$REPORT_FILE" ]]; then
  echo "Report written to: $REPORT_FILE"
else
  echo "Report was not written: $REPORT_FILE" >&2
fi
if ((SNAPSHOT)); then
  if [[ -e "$SNAPSHOT_FILE" ]]; then
    echo "Snapshot written to: $SNAPSHOT_FILE"
  else
    echo "Snapshot was not written: $SNAPSHOT_FILE" >&2
  fi
fi

exit "$status"
|
||||
Loading…
x
Reference in New Issue
Block a user