Add multinode NCCL PDF matrix runner
This commit is contained in:
parent
bdffd7e616
commit
8ff5021385
@ -23,7 +23,7 @@ multinode_nccl:
|
|||||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
nccl_tests_dir: null
|
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||||
tests:
|
tests:
|
||||||
- all_reduce_perf
|
- all_reduce_perf
|
||||||
- alltoall_perf
|
- alltoall_perf
|
||||||
|
|||||||
@ -23,7 +23,7 @@ multinode_nccl:
|
|||||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
nccl_tests_dir: null
|
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||||
tests:
|
tests:
|
||||||
- all_reduce_perf
|
- all_reduce_perf
|
||||||
- alltoall_perf
|
- alltoall_perf
|
||||||
|
|||||||
@ -23,7 +23,7 @@ multinode_nccl:
|
|||||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
nccl_tests_dir: null
|
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||||
tests:
|
tests:
|
||||||
- all_reduce_perf
|
- all_reduce_perf
|
||||||
- alltoall_perf
|
- alltoall_perf
|
||||||
|
|||||||
@ -23,7 +23,7 @@ multinode_nccl:
|
|||||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
nccl_tests_dir: null
|
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||||
tests:
|
tests:
|
||||||
- all_reduce_perf
|
- all_reduce_perf
|
||||||
- alltoall_perf
|
- alltoall_perf
|
||||||
|
|||||||
@ -23,7 +23,7 @@ multinode_nccl:
|
|||||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||||
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
- /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
|
||||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||||
nccl_tests_dir: null
|
nccl_tests_dir: /data/nccl-tests-latest/build
|
||||||
tests:
|
tests:
|
||||||
- all_reduce_perf
|
- all_reduce_perf
|
||||||
- alltoall_perf
|
- alltoall_perf
|
||||||
|
|||||||
@ -24,6 +24,16 @@ bash scripts/multinode_nccl_deep_diagnose.sh preflight
|
|||||||
bash scripts/multinode_nccl_deep_diagnose.sh all
|
bash scripts/multinode_nccl_deep_diagnose.sh all
|
||||||
```
|
```
|
||||||
|
|
||||||
|
如果要按 PDF 参考矩阵跑正式多机多卡报告,使用:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /root/test_gpu_scripts
|
||||||
|
bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`,输出到
|
||||||
|
`reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。
|
||||||
|
|
||||||
默认输出目录为:
|
默认输出目录为:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
@ -63,7 +73,7 @@ bash scripts/multinode_nccl_deep_diagnose.sh all
|
|||||||
如果 nccl-tests 或 NCCL 运行库路径变化:
|
如果 nccl-tests 或 NCCL 运行库路径变化:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
NCCL_TESTS_DIR=/opt/gpu-test-tools/nccl-tests/build \
|
NCCL_TESTS_DIR=/data/nccl-tests-latest/build \
|
||||||
NCCL_LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/path/to/nccl/lib:/usr/local/cuda/lib64 \
|
NCCL_LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/path/to/nccl/lib:/usr/local/cuda/lib64 \
|
||||||
bash scripts/multinode_nccl_deep_diagnose.sh graph
|
bash scripts/multinode_nccl_deep_diagnose.sh graph
|
||||||
```
|
```
|
||||||
|
|||||||
@ -119,6 +119,13 @@ cd /root/test_gpu_scripts
|
|||||||
bash scripts/run_h100_single_node_all.sh
|
bash scripts/run_h100_single_node_all.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 多机多卡 PDF 矩阵
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /root/test_gpu_scripts
|
||||||
|
bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||||
|
```
|
||||||
|
|
||||||
### 完整深度诊断
|
### 完整深度诊断
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -155,6 +162,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
|||||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
|
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
|
||||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
||||||
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
|
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
|
||||||
|
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 |
|
||||||
|
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
|
||||||
|
|
||||||
## 当前建议
|
## 当前建议
|
||||||
|
|
||||||
|
|||||||
@ -28,8 +28,17 @@
|
|||||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
|
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
|
||||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
||||||
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
|
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
|
||||||
|
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall |
|
||||||
|
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
|
||||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
||||||
|
|
||||||
|
多机多卡 PDF 矩阵:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /root/test_gpu_scripts
|
||||||
|
bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||||
|
```
|
||||||
|
|
||||||
单节点 H100 原始 all 报告:
|
单节点 H100 原始 all 报告:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
142
scripts/run_multinode_nccl_pdf_matrix.sh
Executable file
142
scripts/run_multinode_nccl_pdf_matrix.sh
Executable file
@ -0,0 +1,142 @@
|
|||||||
|
#!/usr/bin/env bash
# errexit (-e) is deliberately not enabled: the script reads $? after the
# preflight and matrix commands and forwards those exit codes itself.
set -uo pipefail

# Run the formal cross-node NCCL PDF matrix for the current two-node H100 pair.
# This wrapper standardizes the command, output naming, and preflight hook; the
# actual benchmark implementation remains in gpu_tester.py / MultiNodeNCCLTest.
#
# Environment overrides (each can also be set through a command-line option):
#   PYTHON_BIN   Python interpreter used to launch gpu_tester.py
#   CONFIG_FILE  Matrix config file
#   OUT_DIR      Report output directory
#   FORMAT       Report format

# Resolve paths relative to this script so it can be started from any directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

# A failed cd leaves these empty; stop before the defaults below turn into
# paths rooted at "/".
if [[ -z "$SCRIPT_DIR" || -z "$PROJECT_DIR" ]]; then
  echo "Unable to resolve the project directory from ${BASH_SOURCE[0]}" >&2
  exit 1
fi

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"

# Mode flags (0/1), changed only by command-line options.
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0
|
||||||
|
|
||||||
|
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout; callers redirect to stderr for error paths
#######################################
usage() {
  # Quoted delimiter: the text is printed literally, with no expansion.
  cat <<'EOF'
Usage: run_multinode_nccl_pdf_matrix.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only    Run only the preflight check, not the matrix workload
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}
|
||||||
|
|
||||||
|
#######################################
# Parse command-line options into the script's global settings.
# Globals:   PYTHON_BIN, CONFIG_FILE, OUT_DIR, FORMAT (written by value options)
#            RUN_PREFLIGHT, PREFLIGHT_ONLY, DRY_RUN (written by flag options)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help; diagnostics on stderr
# Exits:     0 after printing help; 2 on an unknown option or a missing value
#######################################
parse_args() {
  while (($#)); do
    case "$1" in
      --python | --config | --out-dir | --format)
        # With nounset enabled, reading "$2" when the option is the last
        # argument aborts with bash's "unbound variable" error; report the
        # actual mistake instead.
        if (($# < 2)); then
          echo "Missing value for $1" >&2
          usage >&2
          exit 2
        fi
        case "$1" in
          --python) PYTHON_BIN="$2" ;;
          --config) CONFIG_FILE="$2" ;;
          --out-dir) OUT_DIR="$2" ;;
          --format) FORMAT="$2" ;;
        esac
        shift 2
        ;;
      --no-preflight)
        RUN_PREFLIGHT=0
        shift
        ;;
      --preflight-only)
        PREFLIGHT_ONLY=1
        shift
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||||
|
|
||||||
|
# Accept only the report formats listed in the help text.
case "$FORMAT" in
  md | json | html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# If the configured interpreter is not executable, fall back to python3 from
# PATH. The substitution is reported on stderr so the operator can see which
# interpreter will actually run the matrix.
if [[ ! -x "$PYTHON_BIN" ]]; then
  requested_python="$PYTHON_BIN"
  # `|| true` keeps a missing python3 from surfacing here; the check below
  # reports it with a clearer message.
  PYTHON_BIN="$(command -v python3 || true)"
  if [[ -n "$PYTHON_BIN" ]]; then
    echo "Python executable '$requested_python' is not usable; falling back to $PYTHON_BIN" >&2
  fi
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# Timestamp that makes each run's report file name unique.
TS="$(date +%Y%m%d_%H%M%S)"

# Create the output directory only for real runs, so --dry-run has no side
# effects. errexit is off, so a failure here must be handled explicitly rather
# than discovered when the report cannot be written.
if ((DRY_RUN == 0)); then
  if ! mkdir -p -- "$OUT_DIR"; then
    echo "Cannot create report output directory: $OUT_DIR" >&2
    exit 1
  fi
fi

REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"

# Commands are kept as arrays so paths containing spaces survive both
# execution and the dry-run printout.
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)
|
||||||
|
|
||||||
|
# Run summary. The "Matrix" line is a fixed description and is not derived
# from CONFIG_FILE, so it may not describe a custom --config.
echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"

#######################################
# Print one command in shell-quoted form without running it.
# Arguments: $1 - label shown after "DRY RUN"; remaining args - the command
# Outputs:   a single line on stdout
#######################################
print_dry_run() {
  local label="$1"
  shift
  printf 'DRY RUN %s:' "$label"
  # %q quotes each word so the printed line can be pasted back into a shell.
  printf ' %q' "$@"
  printf '\n'
}

if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    print_dry_run preflight "${PREFLIGHT_CMD[@]}"
  fi
  # With --preflight-only the matrix command would never run, so it is not shown.
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  print_dry_run matrix "${MATRIX_CMD[@]}"
  exit 0
fi
|
||||||
|
|
||||||
|
# Run the preflight check first; a failing preflight stops the run and its
# exit code is forwarded to the caller.
if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

# Run the matrix workload and keep its exit code as the script's own.
"${MATRIX_CMD[@]}"
status=$?

if ((status != 0)); then
  echo "Matrix run failed with exit code $status" >&2
fi

# Only claim a report exists when the file is actually on disk; a failed run
# may leave nothing behind.
if [[ -f "$REPORT_FILE" ]]; then
  echo "Report written to: $REPORT_FILE"
else
  echo "Report file was not created: $REPORT_FILE" >&2
fi

exit "$status"
|
||||||
Loading…
x
Reference in New Issue
Block a user